From 1b40ddafb832ba79d0dd6405b8036e385a89383e Mon Sep 17 00:00:00 2001 From: ribes Date: Wed, 28 May 2008 07:51:14 +0000 Subject: [PATCH] - New Launcher with some patches for EDF PBS configuration - Batch resources are now in a different list --- bin/config_appli.xml | 24 +- bin/shutdownSalome.py | 31 +- idl/SALOME_ContainerManager.idl | 1 - src/Batch/Batch_BatchManager_eClient.cxx | 235 ++++ src/Batch/Batch_BatchManager_eClient.hxx | 73 ++ src/Batch/Batch_BatchManager_eLSF.cxx | 310 +++++ src/Batch/Batch_BatchManager_eLSF.hxx | 93 ++ src/Batch/Batch_BatchManager_ePBS.cxx | 296 +++++ src/Batch/Batch_BatchManager_ePBS.hxx | 91 ++ src/Batch/Batch_FactBatchManager_eClient.cxx | 48 + .../Batch_FactBatchManager_eClient.hxx} | 36 +- src/Batch/Batch_FactBatchManager_eLSF.cxx | 63 + src/Batch/Batch_FactBatchManager_eLSF.hxx | 60 + src/Batch/Batch_FactBatchManager_ePBS.cxx | 64 + src/Batch/Batch_FactBatchManager_ePBS.hxx | 60 + src/Batch/Batch_JobInfo_eLSF.cxx | 103 ++ src/Batch/Batch_JobInfo_eLSF.hxx | 69 ++ src/Batch/Batch_JobInfo_ePBS.cxx | 114 ++ src/Batch/Batch_JobInfo_ePBS.hxx | 69 ++ src/Batch/Batch_Parametre.cxx | 4 + src/Batch/Batch_Parametre.hxx | 2 + src/Batch/Makefile.am | 22 +- src/{Launcher => Batch}/MpiImpl.cxx | 49 +- src/{Launcher => Batch}/MpiImpl.hxx | 29 +- src/Container/Makefile.am | 1 + src/Container/SALOME_ContainerManager.cxx | 599 +++++++++- src/Container/SALOME_ContainerManager.hxx | 36 +- src/Launcher/BatchLight_BatchManager.cxx | 332 ------ src/Launcher/BatchLight_BatchManager.hxx | 91 -- src/Launcher/BatchLight_BatchManager_PBS.cxx | 476 -------- src/Launcher/BatchLight_BatchManager_PBS.hxx | 64 - .../BatchLight_BatchManager_SLURM.cxx | 340 ------ src/Launcher/BatchLight_Job.cxx | 160 --- src/Launcher/BatchLight_Job.hxx | 75 -- src/Launcher/Launcher.cxx | 625 ++++++++++ src/Launcher/Launcher.hxx | 79 ++ src/Launcher/Makefile.am | 37 +- src/Launcher/SALOME_Launcher.cxx | 183 +-- src/Launcher/SALOME_Launcher.hxx | 7 +- 
src/LifeCycleCORBA/TestContainerManager.cxx | 2 +- src/ResourcesManager/Makefile.am | 26 +- src/ResourcesManager/ResourcesManager.cxx | 486 ++++++++ src/ResourcesManager/ResourcesManager.hxx | 116 ++ .../SALOME_LoadRateManager.cxx | 79 +- .../SALOME_LoadRateManager.hxx | 9 +- .../SALOME_ResourcesCatalog_Handler.cxx | 12 +- .../SALOME_ResourcesCatalog_Parser.cxx | 12 +- .../SALOME_ResourcesCatalog_Parser.hxx | 4 +- .../SALOME_ResourcesManager.cxx | 1040 +---------------- .../SALOME_ResourcesManager.hxx | 77 +- 50 files changed, 3991 insertions(+), 2923 deletions(-) create mode 100644 src/Batch/Batch_BatchManager_eClient.cxx create mode 100644 src/Batch/Batch_BatchManager_eClient.hxx create mode 100644 src/Batch/Batch_BatchManager_eLSF.cxx create mode 100644 src/Batch/Batch_BatchManager_eLSF.hxx create mode 100644 src/Batch/Batch_BatchManager_ePBS.cxx create mode 100644 src/Batch/Batch_BatchManager_ePBS.hxx create mode 100644 src/Batch/Batch_FactBatchManager_eClient.cxx rename src/{Launcher/BatchLight_BatchManager_SLURM.hxx => Batch/Batch_FactBatchManager_eClient.hxx} (53%) create mode 100644 src/Batch/Batch_FactBatchManager_eLSF.cxx create mode 100644 src/Batch/Batch_FactBatchManager_eLSF.hxx create mode 100644 src/Batch/Batch_FactBatchManager_ePBS.cxx create mode 100644 src/Batch/Batch_FactBatchManager_ePBS.hxx create mode 100644 src/Batch/Batch_JobInfo_eLSF.cxx create mode 100644 src/Batch/Batch_JobInfo_eLSF.hxx create mode 100644 src/Batch/Batch_JobInfo_ePBS.cxx create mode 100644 src/Batch/Batch_JobInfo_ePBS.hxx rename src/{Launcher => Batch}/MpiImpl.cxx (84%) rename src/{Launcher => Batch}/MpiImpl.hxx (85%) delete mode 100644 src/Launcher/BatchLight_BatchManager.cxx delete mode 100644 src/Launcher/BatchLight_BatchManager.hxx delete mode 100644 src/Launcher/BatchLight_BatchManager_PBS.cxx delete mode 100644 src/Launcher/BatchLight_BatchManager_PBS.hxx delete mode 100644 src/Launcher/BatchLight_BatchManager_SLURM.cxx delete mode 100644 
src/Launcher/BatchLight_Job.cxx delete mode 100644 src/Launcher/BatchLight_Job.hxx create mode 100644 src/Launcher/Launcher.cxx create mode 100644 src/Launcher/Launcher.hxx create mode 100644 src/ResourcesManager/ResourcesManager.cxx create mode 100644 src/ResourcesManager/ResourcesManager.hxx diff --git a/bin/config_appli.xml b/bin/config_appli.xml index bd7799353..2695ad7e5 100644 --- a/bin/config_appli.xml +++ b/bin/config_appli.xml @@ -1,20 +1,20 @@ - + - - - - - - - - - - + + + + + + + + + + - + diff --git a/bin/shutdownSalome.py b/bin/shutdownSalome.py index 0c96154bb..491266662 100755 --- a/bin/shutdownSalome.py +++ b/bin/shutdownSalome.py @@ -1,28 +1,5 @@ #!/usr/bin/env python -import orbmodule -import Engines -import Registry -import SALOME -import SALOMEDS -import SALOME_ModuleCatalog -clt=orbmodule.client() -obj = clt.Resolve('Kernel/Session') -if obj != None: - ses = obj._narrow(SALOME.Session) - ses.StopSession() -obj = clt.Resolve('SalomeLauncher') -if obj != None: - cm = obj._narrow(Engines.SalomeLauncher) - cm.Shutdown() -obj = clt.Resolve('Kernel/ModulCatalog') -if obj != None: - mc = obj._narrow(SALOME_ModuleCatalog.ModuleCatalog) - mc.shutdown() -obj = clt.Resolve('Registry') -if obj != None: - reg = obj._narrow(Registry.Components) - reg.Shutdown() -obj = clt.Resolve('myStudyManager') -if obj != None: - sm = obj._narrow(SALOMEDS.StudyManager) - sm.Shutdown() +import salome +salome.salome_init() +salome.lcc.shutdownServers() +salome.SALOME_LifeCycleCORBA_killOmniNames() diff --git a/idl/SALOME_ContainerManager.idl b/idl/SALOME_ContainerManager.idl index d3b9fd6b5..d4623c160 100644 --- a/idl/SALOME_ContainerManager.idl +++ b/idl/SALOME_ContainerManager.idl @@ -80,7 +80,6 @@ struct BatchParameters // has to be like this : hh:mm string mem; // Minimum of memory needed // has to be like : 32gb or 512mb - long nb_proc; // Number of processors requested }; diff --git a/src/Batch/Batch_BatchManager_eClient.cxx b/src/Batch/Batch_BatchManager_eClient.cxx 
new file mode 100644 index 000000000..41ce47c2e --- /dev/null +++ b/src/Batch/Batch_BatchManager_eClient.cxx @@ -0,0 +1,235 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager_eLSF.cxx : emulation of LSF client + * + * Auteur : Bernard SECHER - CEA DEN + * Mail : mailto:bernard.secher@cea.fr + * Date : Thu Apr 24 10:17:22 2008 + * Projet : PAL Salome + * + */ + +#include +#include +#include +#include +#include "Batch_BatchManager_eClient.hxx" + +namespace Batch { + + BatchManager_eClient::BatchManager_eClient(const Batch::FactBatchManager * parent, const char* host, const char* protocol, const char* mpiImpl) : BatchManager(parent, host), _protocol(protocol), _username("") + { + // instanciation of mpi implementation needed to launch executable in batch script + _mpiImpl = FactoryMpiImpl(mpiImpl); + } + + // Destructeur + BatchManager_eClient::~BatchManager_eClient() + { + // Nothing to do + delete _mpiImpl; + } + + void BatchManager_eClient::exportInputFiles(const Job& job) throw(EmulationException) + { + int status; + Parametre params = job.getParametre(); + Versatile V = params[INFILE]; + 
Versatile::iterator Vit; + string command; + string copy_command; + _username = string(params[USER]); + + // Test protocol + if( _protocol == "rsh" ) + copy_command = "rcp "; + else if( _protocol == "ssh" ) + copy_command = "scp "; + else + throw EmulationException("Unknown protocol : only rsh and ssh are known !"); + + // First step : creating batch tmp files directory + command = _protocol; + command += " "; + if(_username != ""){ + command += _username; + command += "@"; + } + command += _hostname; + command += " \"mkdir -p "; + command += string(params[TMPDIR]); + command += "\"" ; + cerr << command.c_str() << endl; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + std::string ex_mess("Error of connection on remote host ! status = "); + ex_mess += oss.str(); + throw EmulationException(ex_mess.c_str()); + } + + // Second step : copy fileToExecute into + // batch tmp files directory + command = copy_command; + command += string(params[EXECUTABLE]); + command += " "; + if(_username != ""){ + command += _username; + command += "@"; + } + command += _hostname; + command += ":"; + command += string(params[TMPDIR]); + cerr << command.c_str() << endl; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + std::string ex_mess("Error of connection on remote host ! 
status = "); + ex_mess += oss.str(); + throw EmulationException(ex_mess.c_str()); + } + + // Third step : copy filesToExportList into + // batch tmp files directory + for(Vit=V.begin(); Vit!=V.end(); Vit++) { + CoupleType cpt = *static_cast< CoupleType * >(*Vit); + Couple inputFile = cpt; + command = copy_command; + command += inputFile.getLocal(); + command += " "; + if(_username != ""){ + command += _username; + command += "@"; + } + command += _hostname; + command += ":"; + command += inputFile.getRemote(); + cerr << command.c_str() << endl; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + std::string ex_mess("Error of connection on remote host ! status = "); + ex_mess += oss.str(); + throw EmulationException(ex_mess.c_str()); + } + } + + } + + void BatchManager_eClient::importOutputFiles( const Job & job, const string directory ) throw(EmulationException) + { + string command; + int status; + + Parametre params = job.getParametre(); + Versatile V = params[OUTFILE]; + Versatile::iterator Vit; + + for(Vit=V.begin(); Vit!=V.end(); Vit++) { + CoupleType cpt = *static_cast< CoupleType * >(*Vit); + Couple outputFile = cpt; + if( _protocol == "rsh" ) + command = "rcp "; + else if( _protocol == "ssh" ) + command = "scp "; + else + throw EmulationException("Unknown protocol"); + + if (_username != ""){ + command += _username; + command += "@"; + } + command += _hostname; + command += ":"; + command += outputFile.getRemote(); + command += " "; + command += directory; + cerr << command.c_str() << endl; + status = system(command.c_str()); + if(status) + { + // Try to get what we can (logs files) + // throw BatchException("Error of connection on remote host"); + std::string mess("Copy command failed ! 
status is :"); + ostringstream status_str; + status_str << status; + mess += status_str.str(); + cerr << mess << endl; + } + } + + } + + MpiImpl *BatchManager_eClient::FactoryMpiImpl(string mpiImpl) throw(EmulationException) + { + if(mpiImpl == "lam") + return new MpiImpl_LAM(); + else if(mpiImpl == "mpich1") + return new MpiImpl_MPICH1(); + else if(mpiImpl == "mpich2") + return new MpiImpl_MPICH2(); + else if(mpiImpl == "openmpi") + return new MpiImpl_OPENMPI(); + else if(mpiImpl == "slurm") + return new MpiImpl_SLURM(); + else{ + ostringstream oss; + oss << mpiImpl << " : not yet implemented"; + throw EmulationException(oss.str().c_str()); + } + } + + string BatchManager_eClient::BuildTemporaryFileName() const + { + //build more complex file name to support multiple salome session + char *temp = new char[19]; + strcpy(temp, "/tmp/command"); + strcat(temp, "XXXXXX"); +#ifndef WNT + mkstemp(temp); +#else + char aPID[80]; + itoa(getpid(), aPID, 10); + strcat(temp, aPID); +#endif + + string command(temp); + delete [] temp; + command += ".sh"; + return command; + } + + void BatchManager_eClient::RmTmpFile(std::string & TemporaryFileName) + { + string command = "rm "; + command += TemporaryFileName; + char *temp = strdup(command.c_str()); + int lgthTemp = strlen(temp); + temp[lgthTemp - 3] = '*'; + temp[lgthTemp - 2] = '\0'; + system(temp); + free(temp); + } + +} diff --git a/src/Batch/Batch_BatchManager_eClient.hxx b/src/Batch/Batch_BatchManager_eClient.hxx new file mode 100644 index 000000000..717eae6a3 --- /dev/null +++ b/src/Batch/Batch_BatchManager_eClient.hxx @@ -0,0 +1,73 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. 
+// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager_eLSF.hxx : emulation of client + * + * Auteur : Bernard SECHER - CEA DEN + * Mail : mailto:bernard.secher@cea.fr + * Date : Thu Apr 24 10:17:22 2008 + * Projet : PAL Salome + * + */ + +#ifndef _BATCHMANAGER_eClient_H_ +#define _BATCHMANAGER_eClient_H_ + + +#include "MpiImpl.hxx" +#include "Batch_BatchManager.hxx" + +namespace Batch { + + class Job; + + class EmulationException + { + public: + const std::string msg; + + EmulationException(const std::string m) : msg(m) {} + }; + + class BatchManager_eClient : public BatchManager + { + public: + // Constructeur et destructeur + BatchManager_eClient(const Batch::FactBatchManager * parent, const char* host="localhost", const char* protocol="ssh", const char* mpiImpl="indif"); + virtual ~BatchManager_eClient(); + void importOutputFiles( const Job & job, const std::string directory ) throw(EmulationException); + + protected: + std::string _protocol; // protocol to access _hostname + std::string _username; // username to access _hostname + MpiImpl *_mpiImpl; // Mpi implementation to launch executable in batch script + + std::string BuildTemporaryFileName() const; + void RmTmpFile(std::string & TemporaryFileName); + MpiImpl* FactoryMpiImpl(string mpiImpl) throw(EmulationException); + void exportInputFiles(const Job & job) throw(EmulationException); + + private: + + }; + +} + +#endif diff --git a/src/Batch/Batch_BatchManager_eLSF.cxx 
b/src/Batch/Batch_BatchManager_eLSF.cxx new file mode 100644 index 000000000..d1c44e60d --- /dev/null +++ b/src/Batch/Batch_BatchManager_eLSF.cxx @@ -0,0 +1,310 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager_eLSF.cxx : emulation of LSF client + * + * Auteur : Bernard SECHER - CEA DEN + * Mail : mailto:bernard.secher@cea.fr + * Date : Thu Apr 24 10:17:22 2008 + * Projet : PAL Salome + * + */ + +#include +#include +#include +#include +#include "Batch_BatchManager_eLSF.hxx" + +namespace Batch { + + BatchManager_eLSF::BatchManager_eLSF(const FactBatchManager * parent, const char * host, const char * protocol, const char * mpiImpl) throw(InvalidArgumentException,ConnexionFailureException) : BatchManager_eClient(parent,host,protocol,mpiImpl) + { + // Nothing to do + } + + // Destructeur + BatchManager_eLSF::~BatchManager_eLSF() + { + // Nothing to do + } + + // Methode pour le controle des jobs : soumet un job au gestionnaire + const JobId BatchManager_eLSF::submitJob(const Job & job) + { + int status; + Parametre params = job.getParametre(); + const std::string dirForTmpFiles = 
params[TMPDIR]; + const string fileToExecute = params[EXECUTABLE]; + string::size_type p1 = fileToExecute.find_last_of("/"); + string::size_type p2 = fileToExecute.find_last_of("."); + std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + + // export input files on cluster + exportInputFiles(job); + + // build batch script for job + buildBatchScript(job); + + // define name of log file + string logFile="/tmp/logs/"; + logFile += getenv("USER"); + logFile += "/batchSalome_"; + srand ( time(NULL) ); + int ir = rand(); + ostringstream oss; + oss << ir; + logFile += oss.str(); + logFile += ".log"; + + string command; + + // define command to submit batch + command = _protocol; + command += " "; + + if(_username != ""){ + command += _username; + command += "@"; + } + + command += _hostname; + command += " \"cd " ; + command += dirForTmpFiles ; + command += "; bsub < " ; + command += fileNameToExecute ; + command += "_Batch.sh\" > "; + command += logFile; + cerr << command.c_str() << endl; + status = system(command.c_str()); + if(status) + throw EmulationException("Error of connection on remote host"); + + // read id of submitted job in log file + char line[128]; + FILE *fp = fopen(logFile.c_str(),"r"); + fgets( line, 128, fp); + fclose(fp); + + string sline(line); + int p10 = sline.find("<"); + int p20 = sline.find(">"); + string strjob = sline.substr(p10+1,p20-p10-1); + + JobId id(this, strjob); + return id; + } + + // Methode pour le controle des jobs : retire un job du gestionnaire + void BatchManager_eLSF::deleteJob(const JobId & jobid) + { + int status; + int ref; + istringstream iss(jobid.getReference()); + iss >> ref; + + // define command to submit batch + string command; + command = _protocol; + command += " "; + + if (_username != ""){ + command += _username; + command += "@"; + } + + command += _hostname; + command += " \"bkill " ; + command += iss.str(); + command += "\""; + cerr << command.c_str() << endl; + status = system(command.c_str()); 
+ if(status) + throw EmulationException("Error of connection on remote host"); + + cerr << "jobId = " << ref << "killed" << endl; + } + + // Methode pour le controle des jobs : suspend un job en file d'attente + void BatchManager_eLSF::holdJob(const JobId & jobid) + { + throw EmulationException("Not yet implemented"); + } + + // Methode pour le controle des jobs : relache un job suspendu + void BatchManager_eLSF::releaseJob(const JobId & jobid) + { + throw EmulationException("Not yet implemented"); + } + + + // Methode pour le controle des jobs : modifie un job en file d'attente + void BatchManager_eLSF::alterJob(const JobId & jobid, const Parametre & param, const Environnement & env) + { + throw EmulationException("Not yet implemented"); + } + + // Methode pour le controle des jobs : modifie un job en file d'attente + void BatchManager_eLSF::alterJob(const JobId & jobid, const Parametre & param) + { + alterJob(jobid, param, Environnement()); + } + + // Methode pour le controle des jobs : modifie un job en file d'attente + void BatchManager_eLSF::alterJob(const JobId & jobid, const Environnement & env) + { + alterJob(jobid, Parametre(), env); + } + + // Methode pour le controle des jobs : renvoie l'etat du job + JobInfo BatchManager_eLSF::queryJob(const JobId & jobid) + { + int id; + istringstream iss(jobid.getReference()); + iss >> id; + + // define name of log file + string logFile="/tmp/logs/"; + logFile += getenv("USER"); + logFile += "/batchSalome_"; + + srand ( time(NULL) ); + int ir = rand(); + ostringstream oss; + oss << ir; + logFile += oss.str(); + logFile += ".log"; + + string command; + int status; + + // define command to submit batch + command = _protocol; + command += " "; + + if (_username != ""){ + command += _username; + command += "@"; + } + + command += _hostname; + command += " \"bjobs " ; + command += iss.str(); + command += "\" > "; + command += logFile; + cerr << command.c_str() << endl; + status = system(command.c_str()); + if(status) + 
throw EmulationException("Error of connection on remote host"); + + JobInfo_eLSF ji = JobInfo_eLSF(id,logFile); + return ji; + } + + + + // Methode pour le controle des jobs : teste si un job est present en machine + bool BatchManager_eLSF::isRunning(const JobId & jobid) + { + throw EmulationException("Not yet implemented"); + } + + void BatchManager_eLSF::buildBatchScript(const Job & job) throw(EmulationException) + { + int status; + Parametre params = job.getParametre(); + const int nbproc = params[NBPROC]; + const long edt = params[MAXWALLTIME]; + const long mem = params[MAXRAMSIZE]; + const string workDir = params[WORKDIR]; + const std::string dirForTmpFiles = params[TMPDIR]; + const string fileToExecute = params[EXECUTABLE]; + string::size_type p1 = fileToExecute.find_last_of("/"); + string::size_type p2 = fileToExecute.find_last_of("."); + std::string rootNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + std::string fileNameToExecute = "~/" + dirForTmpFiles + "/" + string(basename(fileToExecute.c_str())); + + int idx = dirForTmpFiles.find("Batch/"); + std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); + + std::string TmpFileName = BuildTemporaryFileName(); + ofstream tempOutputFile; + tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); + + tempOutputFile << "#! /bin/sh -f" << endl ; + if( edt > 0 ) + tempOutputFile << "#BSUB -W " << getWallTime(edt) << endl ; + if( mem > 0 ) + tempOutputFile << "#BSUB -M " << mem*1024 << endl ; + tempOutputFile << "#BSUB -n " << nbproc << endl ; + tempOutputFile << "#BSUB -o runSalome.output.log." << filelogtemp << endl ; + tempOutputFile << "#BSUB -e runSalome.error.log." 
<< filelogtemp << endl ; + if( workDir.size() > 0 ) + tempOutputFile << "cd " << workDir << endl ; + tempOutputFile << _mpiImpl->boot("",nbproc); + tempOutputFile << _mpiImpl->run("",nbproc,fileNameToExecute); + tempOutputFile << _mpiImpl->halt(); + tempOutputFile.flush(); + tempOutputFile.close(); + chmod(TmpFileName.c_str(), 0x1ED); + cerr << TmpFileName.c_str() << endl; + + string command; + if( _protocol == "rsh" ) + command = "rcp "; + else if( _protocol == "ssh" ) + command = "scp "; + else + throw EmulationException("Unknown protocol"); + command += TmpFileName; + command += " "; + if(_username != ""){ + command += _username; + command += "@"; + } + command += _hostname; + command += ":"; + command += dirForTmpFiles ; + command += "/" ; + command += rootNameToExecute ; + command += "_Batch.sh" ; + cerr << command.c_str() << endl; + status = system(command.c_str()); + if(status) + throw EmulationException("Error of connection on remote host"); + + RmTmpFile(TmpFileName); + + } + + std::string BatchManager_eLSF::getWallTime(const long edt) + { + long h, m; + h = edt / 60; + m = edt - h*60; + ostringstream oss; + if( m >= 10 ) + oss << h << ":" << m; + else + oss << h << ":0" << m; + return oss.str(); + } + +} diff --git a/src/Batch/Batch_BatchManager_eLSF.hxx b/src/Batch/Batch_BatchManager_eLSF.hxx new file mode 100644 index 000000000..00d79a4fa --- /dev/null +++ b/src/Batch/Batch_BatchManager_eLSF.hxx @@ -0,0 +1,93 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager_eLSF.hxx : emulation of LSF client + * + * Auteur : Bernard SECHER - CEA DEN + * Mail : mailto:bernard.secher@cea.fr + * Date : Thu Apr 24 10:17:22 2008 + * Projet : PAL Salome + * + */ + +#ifndef _BATCHMANAGER_eLSF_H_ +#define _BATCHMANAGER_eLSF_H_ + + +#include "Batch_JobId.hxx" +#include "Batch_JobInfo.hxx" +#include "Batch_JobInfo_eLSF.hxx" +#include "Batch_InvalidArgumentException.hxx" +#include "Batch_ConnexionFailureException.hxx" +#include "Batch_APIInternalFailureException.hxx" +#include "Batch_NotYetImplementedException.hxx" +#include "Batch_BatchManager.hxx" +#include "Batch_BatchManager_eClient.hxx" + +namespace Batch { + + class Job; + class JobId; + class JobInfo; + class FactBatchManager; + + class BatchManager_eLSF : public BatchManager_eClient + { + public: + // Constructeur et destructeur + BatchManager_eLSF(const FactBatchManager * parent, const char * host="localhost", const char * protocol="ssh", const char * mpiImpl="indif") throw(InvalidArgumentException,ConnexionFailureException); // connexion a la machine host + virtual ~BatchManager_eLSF(); + + // Recupere le nom du serveur par defaut + // static string BatchManager_LSF::getDefaultServer(); + + // Methodes pour le controle des jobs + virtual const JobId submitJob(const Job & job); // soumet un job au gestionnaire + virtual void deleteJob(const JobId & jobid); // retire un job du gestionnaire + virtual void holdJob(const JobId & jobid); // suspend un job en file d'attente + virtual void releaseJob(const JobId & jobid); // relache un job suspendu + virtual void alterJob(const JobId & jobid, const 
Parametre & param, const Environnement & env); // modifie un job en file d'attente + virtual void alterJob(const JobId & jobid, const Parametre & param); // modifie un job en file d'attente + virtual void alterJob(const JobId & jobid, const Environnement & env); // modifie un job en file d'attente + virtual JobInfo queryJob(const JobId & jobid); // renvoie l'etat du job + virtual bool isRunning(const JobId & jobid); // teste si un job est present en machine + + virtual void setParametre(const JobId & jobid, const Parametre & param) { return alterJob(jobid, param); } // modifie un job en file d'attente + virtual void setEnvironnement(const JobId & jobid, const Environnement & env) { return alterJob(jobid, env); } // modifie un job en file d'attente + + + protected: + void buildBatchScript(const Job & job) throw(EmulationException); + std::string getWallTime(const long edt); + + private: + +#ifdef SWIG + public: + // Recupere le l'identifiant d'un job deja soumis au BatchManager + //virtual const JobId getJobIdByReference(const string & ref) { return BatchManager::getJobIdByReference(ref); } + virtual const JobId getJobIdByReference(const char * ref) { return BatchManager::getJobIdByReference(ref); } +#endif + + }; + +} + +#endif diff --git a/src/Batch/Batch_BatchManager_ePBS.cxx b/src/Batch/Batch_BatchManager_ePBS.cxx new file mode 100644 index 000000000..999025cfb --- /dev/null +++ b/src/Batch/Batch_BatchManager_ePBS.cxx @@ -0,0 +1,296 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. 
+// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * BatchManager_ePBS.cxx : emulation of PBS client + * + * Auteur : Bernard SECHER - CEA DEN + * Mail : mailto:bernard.secher@cea.fr + * Date : Thu Apr 24 10:17:22 2008 + * Projet : PAL Salome + * + */ + +#include +#include +#include +#include +#include "Batch_BatchManager_ePBS.hxx" + +namespace Batch { + + BatchManager_ePBS::BatchManager_ePBS(const FactBatchManager * parent, const char * host, const char * protocol, const char * mpiImpl) throw(InvalidArgumentException,ConnexionFailureException) : BatchManager_eClient(parent,host,protocol,mpiImpl) + { + // Nothing to do + } + + // Destructeur + BatchManager_ePBS::~BatchManager_ePBS() + { + // Nothing to do + } + + // Methode pour le controle des jobs : soumet un job au gestionnaire + const JobId BatchManager_ePBS::submitJob(const Job & job) + { + int status; + Parametre params = job.getParametre(); + const std::string dirForTmpFiles = params[TMPDIR]; + const string fileToExecute = params[EXECUTABLE]; + string::size_type p1 = fileToExecute.find_last_of("/"); + string::size_type p2 = fileToExecute.find_last_of("."); + std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + + // export input files on cluster + exportInputFiles(job); + + // build batch script for job + buildBatchScript(job); + + // define name of log file + string logFile="/tmp/logs/"; + logFile += getenv("USER"); + logFile += "/batchSalome_"; + srand ( time(NULL) ); + int 
ir = rand();
+    ostringstream oss;
+    oss << ir;
+    logFile += oss.str();
+    logFile += ".log";
+
+    string command;
+
+    // define command to submit batch
+    command = _protocol;
+    command += " ";
+
+    if(_username != ""){
+      command += _username;
+      command += "@";
+    }
+
+    command += _hostname;
+    command += " \"cd " ;
+    command += dirForTmpFiles ;
+    command += "; qsub " ;
+    command += fileNameToExecute ;
+    command += "_Batch.sh\" > ";
+    command += logFile;
+    cerr << command.c_str() << endl;
+    status = system(command.c_str());
+    if(status)
+      throw EmulationException("Error of connection on remote host");
+
+    // read id of submitted job in log file
+    char line[128];
+    FILE *fp = fopen(logFile.c_str(),"r");
+    fgets( line, 128, fp);
+    fclose(fp);
+
+    string sline(line);
+    int pos = sline.find(".");
+    string strjob;
+    if(pos == string::npos)
+      strjob = sline;
+    else
+      strjob = sline.substr(0,pos);
+
+    JobId id(this, strjob);
+    return id;
+  }
+
+  // Job control: remove a job from the batch manager.
+  // Runs "qdel <job reference>" on the remote host through _protocol and
+  // throws EmulationException when the command returns a non-zero status.
+  void BatchManager_ePBS::deleteJob(const JobId & jobid)
+  {
+    int status;
+    int ref = 0; // only used in the trace below; stays 0 when the reference is not numeric
+    istringstream iss(jobid.getReference());
+    iss >> ref;
+
+    // build the remote command that deletes the job
+    string command;
+    command = _protocol;
+    command += " ";
+
+    if (_username != ""){
+      command += _username;
+      command += "@";
+    }
+
+    command += _hostname;
+    command += " \"qdel " ;
+    command += iss.str(); // the full job reference, not the parsed integer
+    command += "\"";
+    cerr << command.c_str() << endl;
+    status = system(command.c_str());
+    if(status)
+      throw EmulationException("Error of connection on remote host");
+
+    cerr << "jobId = " << ref << " killed" << endl;
+  }
+
+  // Job control: hold a queued job (not implemented, always throws).
+  void BatchManager_ePBS::holdJob(const JobId & jobid)
+  {
+    throw EmulationException("Not yet implemented");
+  }
+
+  // Job control: release a held job (not implemented, always throws).
+  void BatchManager_ePBS::releaseJob(const JobId & jobid)
+  {
+    throw EmulationException("Not yet implemented");
+  }
+
+
+  // Job control: alter a queued job (not implemented, always throws).
+  void BatchManager_ePBS::alterJob(const JobId & jobid, const Parametre & param, const Environnement & env)
+  {
+    throw EmulationException("Not yet implemented");
+  }
+
+  // Job control: alter the parameters of a queued job.
+  // Forwards to the three-argument overload with an empty environment.
+  void BatchManager_ePBS::alterJob(const JobId & jobid, const Parametre & param)
+  {
+    alterJob(jobid, param, Environnement());
+  }
+
+  // Job control: alter the environment of a queued job.
+  // Forwards to the three-argument overload with empty parameters.
+  void BatchManager_ePBS::alterJob(const JobId & jobid, const Environnement & env)
+  {
+    alterJob(jobid, Parametre(), env);
+  }
+
+  // Job control: return the state of a job.
+  // Runs "qstat -f <job reference>" on the remote host, redirects its output
+  // to a local log file and lets JobInfo_ePBS parse that file.
+  // Throws EmulationException when USER is not set in the environment or when
+  // the remote command fails with an unexpected status.
+  JobInfo BatchManager_ePBS::queryJob(const JobId & jobid)
+  {
+    int id = 0; // stays 0 when the reference is not numeric
+    istringstream iss(jobid.getReference());
+    iss >> id;
+
+    // getenv() returns a null pointer when the variable is unset; appending
+    // that to a std::string is undefined behaviour, so fail explicitly.
+    const char * user = getenv("USER");
+    if(!user)
+      throw EmulationException("USER environment variable is not defined");
+
+    // define name of log file
+    string logFile="/tmp/logs/";
+    logFile += user;
+    logFile += "/batchSalome_";
+
+    ostringstream oss;
+    oss << this << "_" << id;
+    logFile += oss.str();
+    logFile += ".log";
+
+    string command;
+    int status;
+
+    // build the remote command that queries the job
+    command = _protocol;
+    command += " ";
+
+    if (_username != ""){
+      command += _username;
+      command += "@";
+    }
+
+    command += _hostname;
+    command += " \"qstat -f " ;
+    command += iss.str();
+    command += "\" > ";
+    command += logFile;
+    cerr << command.c_str() << endl;
+    status = system(command.c_str());
+    // NOTE(review): 153 (raw or shifted into the wait-status high byte) is
+    // tolerated -- presumably qstat's exit code for an unknown/finished job;
+    // confirm against the PBS documentation.
+    if(status && status != 153 && status != 256*153)
+      throw EmulationException("Error of connection on remote host");
+
+    JobInfo_ePBS ji = JobInfo_ePBS(id,logFile);
+    return ji;
+  }
+
+  // Job control: test whether a job is present on the machine
+  // (not implemented, always throws).
+  bool BatchManager_ePBS::isRunning(const JobId & jobid)
+  {
+    throw EmulationException("Not yet implemented");
+  }
+
+  // Writes the PBS submission script for `job` into a local temporary file
+  // and copies it to <TMPDIR>/<executable root name>_Batch.sh on the remote
+  // host with rcp (rsh protocol) or scp (ssh protocol).
+  // Throws EmulationException on an unknown protocol, when the temporary
+  // script cannot be created, or when the remote copy fails.
+  void BatchManager_ePBS::buildBatchScript(const Job & job) throw(EmulationException)
+  {
+    int status;
+    Parametre params = job.getParametre();
+    const long nbproc = params[NBPROC];
+    const long edt = params[MAXWALLTIME];
+    const long mem = params[MAXRAMSIZE];
+    const string workDir = params[WORKDIR];
+    const std::string dirForTmpFiles = params[TMPDIR];
+    const string fileToExecute = params[EXECUTABLE];
+    const string home = params[HOMEDIR];
+    // When "/" is absent p1 is npos and p1+1 wraps to 0; when "." is absent
+    // the count wraps to a huge value and substr() clips it to the end.
+    string::size_type p1 = fileToExecute.find_last_of("/");
+    string::size_type p2 = fileToExecute.find_last_of(".");
+    std::string rootNameToExecute = fileToExecute.substr(p1+1,p2-p1-1);
+    std::string fileNameToExecute = "~/" + dirForTmpFiles + "/" + string(basename(fileToExecute.c_str()));
+
+    // Suffix of the log file names: whatever follows "Batch/" in the
+    // temporary directory; the whole name is used when the marker is absent
+    // (the previous code stored npos in an int and sliced at a bogus offset).
+    const string batchMarker = "Batch/";
+    const string::size_type idx = dirForTmpFiles.find(batchMarker);
+    std::string filelogtemp = (idx == string::npos) ? dirForTmpFiles : dirForTmpFiles.substr(idx + batchMarker.length());
+
+    // Choose the copy tool before creating anything on disk, so that an
+    // unsupported protocol does not leave a temporary file behind.
+    string command;
+    if( _protocol == "rsh" )
+      command = "rcp ";
+    else if( _protocol == "ssh" )
+      command = "scp ";
+    else
+      throw EmulationException("Unknown protocol");
+
+    std::string TmpFileName = BuildTemporaryFileName();
+    ofstream tempOutputFile;
+    tempOutputFile.open(TmpFileName.c_str(), ofstream::out );
+    if( !tempOutputFile )
+      throw EmulationException("Unable to create temporary batch script");
+
+    tempOutputFile << "#! /bin/sh -f" << endl;
+    // NOTE(review): MAXWALLTIME is multiplied by 60 -- presumably minutes
+    // converted to seconds; confirm the unit with the callers.
+    if( edt > 0 )
+      tempOutputFile << "#PBS -l walltime=" << edt*60 << endl ;
+    if( mem > 0 )
+      tempOutputFile << "#PBS -l mem=" << mem << "mb" << endl ;
+    tempOutputFile << "#PBS -o " << home << "/" << dirForTmpFiles << "/runSalome.output.log." << filelogtemp << endl ;
+    tempOutputFile << "#PBS -e " << home << "/" << dirForTmpFiles << "/runSalome.error.log." << filelogtemp << endl ;
+    if( workDir.size() > 0 )
+      tempOutputFile << "cd " << workDir << endl ;
+    tempOutputFile << _mpiImpl->boot("${PBS_NODEFILE}",nbproc);
+    tempOutputFile << _mpiImpl->run("${PBS_NODEFILE}",nbproc,fileNameToExecute);
+    tempOutputFile << _mpiImpl->halt();
+    tempOutputFile.flush();
+    tempOutputFile.close();
+    chmod(TmpFileName.c_str(), 0755); // rwxr-xr-x, same value as the former 0x1ED
+    cerr << TmpFileName.c_str() << endl;
+
+    command += TmpFileName;
+    command += " ";
+    if(_username != ""){
+      command += _username;
+      command += "@";
+    }
+    command += _hostname;
+    command += ":";
+    command += dirForTmpFiles ;
+    command += "/" ;
+    command += rootNameToExecute ;
+    command += "_Batch.sh" ;
+    cerr << command.c_str() << endl;
+    status = system(command.c_str());
+
+    // remove the local script whether or not the copy succeeded
+    RmTmpFile(TmpFileName);
+
+    if(status)
+      throw EmulationException("Error of connection on remote host");
+
+  }
+
+}
diff --git a/src/Batch/Batch_BatchManager_ePBS.hxx b/src/Batch/Batch_BatchManager_ePBS.hxx
new file mode 100644
index 000000000..42f4b1b69
--- /dev/null
+++ b/src/Batch/Batch_BatchManager_ePBS.hxx
@@ -0,0 +1,91 @@
+// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
+// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License.
+//
+// This library is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+/*
+ * BatchManager_ePBS.hxx : emulation of PBS client
+ *
+ * Author : Bernard SECHER - CEA DEN
+ * Mail : mailto:bernard.secher@cea.fr
+ * Date : Thu Apr 24 10:17:22 2008
+ * Project : PAL Salome
+ *
+ */
+
+// The guard used to be _BATCHMANAGER_eLSF_H_ (copied from the LSF header);
+// it now carries this file's own name so both headers can be included in the
+// same translation unit.
+#ifndef _BATCHMANAGER_ePBS_H_
+#define _BATCHMANAGER_ePBS_H_
+
+#include "Batch_JobId.hxx"
+#include "Batch_JobInfo.hxx"
+#include "Batch_JobInfo_ePBS.hxx"
+#include "Batch_InvalidArgumentException.hxx"
+#include "Batch_ConnexionFailureException.hxx"
+#include "Batch_APIInternalFailureException.hxx"
+#include "Batch_NotYetImplementedException.hxx"
+#include "Batch_BatchManager.hxx"
+#include "Batch_BatchManager_eClient.hxx"
+
+namespace Batch {
+
+  class Job;
+  class JobId;
+  class JobInfo;
+  class FactBatchManager;
+
+  // Batch manager that emulates a PBS client (see the file header): it
+  // specialises BatchManager_eClient with the PBS job-control operations.
+  class BatchManager_ePBS : public BatchManager_eClient
+  {
+  public:
+    // Constructor and destructor
+    BatchManager_ePBS(const FactBatchManager * parent, const char * host="localhost", const char * protocol="ssh", const char * mpiImpl="indif") throw(InvalidArgumentException,ConnexionFailureException); // connects to machine host
+    virtual ~BatchManager_ePBS();
+
+    // Get the default server name
+    // static string BatchManager_LSF::getDefaultServer();
+
+    // Job control methods
+    virtual const JobId submitJob(const Job & job); // submit a job to the manager
+    virtual void deleteJob(const JobId & jobid); // remove a job from the manager
+    virtual void holdJob(const JobId & jobid); // hold a queued job
+    virtual void releaseJob(const JobId & jobid); // release a held job
+    virtual void alterJob(const JobId & jobid, const Parametre & param, const Environnement & env); // alter a queued job
+    virtual void alterJob(const JobId & jobid, const Parametre & param); // alter a queued job
+    virtual void alterJob(const JobId & jobid, const Environnement & env); // alter a queued job
+    virtual JobInfo queryJob(const JobId & jobid); // return the job state
+    virtual bool isRunning(const JobId & jobid); // test whether a job is present on the machine
+
+    virtual void setParametre(const JobId & jobid, const Parametre & param) { return alterJob(jobid, param); } // alter a queued job
+    virtual void setEnvironnement(const JobId & jobid, const Environnement & env) { return alterJob(jobid, env); } // alter a queued job
+
+
+  protected:
+    // Builds the batch script for `job` (implemented in the .cxx file)
+    void buildBatchScript(const Job & job) throw(EmulationException);
+
+  private:
+
+#ifdef SWIG
+  public:
+    // Get the identifier of a job already submitted to the BatchManager
+    //virtual const JobId getJobIdByReference(const string & ref) { return BatchManager::getJobIdByReference(ref); }
+    virtual const JobId getJobIdByReference(const char * ref) { return BatchManager::getJobIdByReference(ref); }
+#endif
+
+  };
+
+}
+
+#endif
diff --git a/src/Batch/Batch_FactBatchManager_eClient.cxx b/src/Batch/Batch_FactBatchManager_eClient.cxx
new file mode 100644
index 000000000..6673879de
--- /dev/null
+++ b/src/Batch/Batch_FactBatchManager_eClient.cxx
@@ -0,0 +1,48 @@
+// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
+// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License.
+//
+// This library is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * FactBatchManager_eClient.cxx : emulation of client + * + * Auteur : Bernard SECHER - CEA DEN + * Mail : mailto:bernard.secher@cea.fr + * Date : Thu Apr 24 10:17:22 2008 + * Projet : PAL Salome + * + */ + +#include +#include +#include "Batch_FactBatchManager_eClient.hxx" +using namespace std; + +namespace Batch { + + // Constructeur + FactBatchManager_eClient::FactBatchManager_eClient(const string & _t) : FactBatchManager(_t) + { + } + + // Destructeur + FactBatchManager_eClient::~FactBatchManager_eClient() + { + // Nothing to do + } + +} diff --git a/src/Launcher/BatchLight_BatchManager_SLURM.hxx b/src/Batch/Batch_FactBatchManager_eClient.hxx similarity index 53% rename from src/Launcher/BatchLight_BatchManager_SLURM.hxx rename to src/Batch/Batch_FactBatchManager_eClient.hxx index 6024b28de..616a6626d 100644 --- a/src/Launcher/BatchLight_BatchManager_SLURM.hxx +++ b/src/Batch/Batch_FactBatchManager_eClient.hxx @@ -18,40 +18,36 @@ // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com // /* - * BatchManager.hxx : + * FactBatchManager_eClient.hxx : emulation of client * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME + * Auteur : Bernard SECHER - CEA DEN + * Mail : mailto:bernard.secher@cea.fr + * Date : Thu Apr 24 10:17:22 2008 + * Projet : PAL Salome * */ -#ifndef _BL_BATCHMANAGER_SLURM_H_ -#define _BL_BATCHMANAGER_SLURM_H_ +#ifndef _FACTBATCHMANAGER_eClient_H_ +#define _FACTBATCHMANAGER_eClient_H_ #include -#include "Utils_SALOME_Exception.hxx" -#include "BatchLight_BatchManager.hxx" +#include +#include "Batch_FactBatchManager.hxx" -namespace BatchLight { +namespace Batch { + + class 
BatchManager_eClient; - class Job; - - class BatchManager_SLURM : public BatchManager + class FactBatchManager_eClient : public FactBatchManager { public: // Constructeur et destructeur - BatchManager_SLURM(const batchParams& p) throw(SALOME_Exception); // connexion a la machine host - virtual ~BatchManager_SLURM(); + FactBatchManager_eClient(const std::string & type); + virtual ~FactBatchManager_eClient(); - // Methodes pour le controle des jobs : virtuelles pures - void deleteJob(const int & jobid); // retire un job du gestionnaire - std::string queryJob(const int & jobid); // renvoie l'etat du job + virtual Batch::BatchManager_eClient * operator() (const char * hostname,const char * protocol, const char * mpi) const = 0; protected: - void buildSalomeCouplingScript(BatchLight::Job* job) throw(SALOME_Exception); - void buildSalomeBatchScript(BatchLight::Job* job) throw(SALOME_Exception); - int submit(BatchLight::Job* job) throw(SALOME_Exception); private: diff --git a/src/Batch/Batch_FactBatchManager_eLSF.cxx b/src/Batch/Batch_FactBatchManager_eLSF.cxx new file mode 100644 index 000000000..227bffa32 --- /dev/null +++ b/src/Batch/Batch_FactBatchManager_eLSF.cxx @@ -0,0 +1,63 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+/*
+ * FactBatchManager_eLSF.cxx :
+ *
+ * Author : Bernard SECHER - CEA DEN
+ * Date : April 2008
+ * Project : PAL Salome
+ *
+ */
+
+#include
+#include "Batch_BatchManager_eLSF.hxx"
+#include "Batch_FactBatchManager_eLSF.hxx"
+//#include "utilities.h"
+
+namespace Batch {
+
+  // File-local factory instance, constructed during static initialisation.
+  // NOTE(review): presumably this is what makes the "eLSF" factory known to
+  // the rest of the library -- confirm in the FactBatchManager constructor.
+  static FactBatchManager_eLSF sFBM_eLSF;
+
+  // Constructor: passes the type name "eLSF" to the base class
+  FactBatchManager_eLSF::FactBatchManager_eLSF() : FactBatchManager_eClient("eLSF")
+  {
+    // Nothing to do
+  }
+
+  // Destructor
+  FactBatchManager_eLSF::~FactBatchManager_eLSF()
+  {
+    // Nothing to do
+  }
+
+  // Functor: allocates a BatchManager_eLSF for `hostname` with new, leaving
+  // protocol and MPI implementation at the constructor's defaults.
+  // The returned pointer is a raw heap allocation.
+  BatchManager * FactBatchManager_eLSF::operator() (const char * hostname) const
+  {
+    // MESSAGE("Building new BatchManager_LSF on host '" << hostname << "'");
+    return new BatchManager_eLSF(this, hostname);
+  }
+
+  // Functor: same as above with an explicit protocol and MPI implementation.
+  BatchManager_eClient * FactBatchManager_eLSF::operator() (const char * hostname, const char * protocol, const char * mpiImpl) const
+  {
+    // MESSAGE("Building new BatchManager_LSF on host '" << hostname << "'");
+    return new BatchManager_eLSF(this, hostname, protocol, mpiImpl);
+  }
+
+}
diff --git a/src/Batch/Batch_FactBatchManager_eLSF.hxx b/src/Batch/Batch_FactBatchManager_eLSF.hxx
new file mode 100644
index 000000000..e1660aaaa
--- /dev/null
+++ b/src/Batch/Batch_FactBatchManager_eLSF.hxx
@@ -0,0 +1,60 @@
+// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
+// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License.
+//
+// This library is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+/*
+ * FactBatchManager_eLSF.hxx :
+ *
+ * Author : Bernard SECHER : CEA DEN
+ * Date : April 2008
+ * Project : PAL Salome
+ *
+ */
+
+#ifndef _FACTBATCHMANAGER_eLSF_H_
+#define _FACTBATCHMANAGER_eLSF_H_
+
+// NOTE(review): a using-directive at header scope leaks namespace std into
+// every file that includes this header; other files may rely on it, so it is
+// left untouched here.
+using namespace std;
+#include
+#include
+#include "Batch_BatchManager_eClient.hxx"
+#include "Batch_FactBatchManager_eClient.hxx"
+
+namespace Batch {
+
+  class BatchManager_eLSF;
+
+  // Factory of BatchManager_eLSF objects.
+  class FactBatchManager_eLSF : public FactBatchManager_eClient
+  {
+  public:
+    // Constructor and destructor
+    FactBatchManager_eLSF();
+    virtual ~FactBatchManager_eLSF();
+
+    // Functors creating a manager for `hostname`, optionally with an
+    // explicit protocol and MPI implementation
+    virtual BatchManager * operator() (const char * hostname) const;
+    virtual BatchManager_eClient * operator() (const char * hostname, const char * protocol, const char * mpiImpl) const;
+
+  protected:
+
+  private:
+
+  };
+
+}
+
+#endif
diff --git a/src/Batch/Batch_FactBatchManager_ePBS.cxx b/src/Batch/Batch_FactBatchManager_ePBS.cxx
new file mode 100644
index 000000000..3bcbda530
--- /dev/null
+++ b/src/Batch/Batch_FactBatchManager_ePBS.cxx
@@ -0,0 +1,64 @@
+// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
+// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License.
+//
+// This library is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+/*
+ * FactBatchManager_ePBS.cxx :
+ *
+ * Author : Bernard SECHER - CEA DEN
+ * Date : April 2008
+ * Project : PAL Salome
+ *
+ */
+
+#include
+#include "Batch_BatchManager_ePBS.hxx"
+#include "Batch_FactBatchManager_ePBS.hxx"
+//#include "utilities.h"
+
+namespace Batch {
+
+  // File-local factory instance, constructed during static initialisation.
+  // NOTE(review): presumably this is what makes the "ePBS" factory known to
+  // the rest of the library -- confirm in the FactBatchManager constructor.
+  static FactBatchManager_ePBS sFBM_ePBS;
+
+  // Constructor: passes the type name "ePBS" to the base class
+  FactBatchManager_ePBS::FactBatchManager_ePBS() : FactBatchManager_eClient("ePBS")
+  {
+    // Nothing to do
+  }
+
+  // Destructor
+  FactBatchManager_ePBS::~FactBatchManager_ePBS()
+  {
+    // Nothing to do
+  }
+
+  // Functor: allocates a BatchManager_ePBS for `hostname` with new, leaving
+  // protocol and MPI implementation at the constructor's defaults.
+  // The returned pointer is a raw heap allocation.
+  BatchManager * FactBatchManager_ePBS::operator() (const char * hostname) const
+  {
+    // MESSAGE("Building new BatchManager_PBS on host '" << hostname << "'");
+    return new BatchManager_ePBS(this, hostname);
+  }
+
+  // Functor: same as above with an explicit protocol and MPI implementation.
+  BatchManager_eClient * FactBatchManager_ePBS::operator() (const char * hostname, const char * protocol, const char * mpiImpl) const
+  {
+    // MESSAGE("Building new BatchManager_PBS on host '" << hostname << "'");
+    return new BatchManager_ePBS(this, hostname, protocol, mpiImpl);
+  }
+
+
+}
diff --git a/src/Batch/Batch_FactBatchManager_ePBS.hxx b/src/Batch/Batch_FactBatchManager_ePBS.hxx
new file mode 100644
index 000000000..69fdf322a
--- /dev/null
+++ b/src/Batch/Batch_FactBatchManager_ePBS.hxx
@@ -0,0 +1,60 @@
+// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
+// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License.
+//
+// This library is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+/*
+ * FactBatchManager_ePBS.hxx :
+ *
+ * Author : Bernard SECHER : CEA DEN
+ * Date : April 2008
+ * Project : PAL Salome
+ *
+ */
+
+#ifndef _FACTBATCHMANAGER_ePBS_H_
+#define _FACTBATCHMANAGER_ePBS_H_
+
+// NOTE(review): a using-directive at header scope leaks namespace std into
+// every file that includes this header; other files may rely on it, so it is
+// left untouched here.
+using namespace std;
+#include
+#include
+#include "Batch_BatchManager_eClient.hxx"
+#include "Batch_FactBatchManager_eClient.hxx"
+
+namespace Batch {
+
+  class BatchManager_ePBS;
+
+  // Factory of BatchManager_ePBS objects.
+  class FactBatchManager_ePBS : public FactBatchManager_eClient
+  {
+  public:
+    // Constructor and destructor
+    FactBatchManager_ePBS();
+    virtual ~FactBatchManager_ePBS();
+
+    // Functors creating a manager for `hostname`, optionally with an
+    // explicit protocol and MPI implementation
+    virtual BatchManager * operator() (const char * hostname) const;
+    virtual BatchManager_eClient * operator() (const char * hostname, const char * protocol, const char * mpiImpl) const;
+
+  protected:
+
+  private:
+
+  };
+
+}
+
+#endif
diff --git a/src/Batch/Batch_JobInfo_eLSF.cxx b/src/Batch/Batch_JobInfo_eLSF.cxx
new file mode 100644
index 000000000..177f2eb06
--- /dev/null
+++ b/src/Batch/Batch_JobInfo_eLSF.cxx
@@ -0,0 +1,103 @@
+// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
+// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
+//
+// This library is free software; you can redistribute it and/or
+// modify it
under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License.
+//
+// This library is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+/*
+ * JobInfo_eLSF.cxx : emulation of LSF client
+ *
+ * Author : Bernard SECHER - CEA DEN
+ * Mail : mailto:bernard.secher@cea.fr
+ * Date : Thu Apr 24 10:17:22 2008
+ * Project : PAL Salome
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include "Batch_Parametre.hxx"
+#include "Batch_Environnement.hxx"
+#include "Batch_RunTimeException.hxx"
+#include "Batch_APIInternalFailureException.hxx"
+#include "Batch_JobInfo_eLSF.hxx"
+
+namespace Batch {
+
+
+
+  // Constructor: fills _param[ID] with `id`, then reads the job status from
+  // `logFile`. The first line of the file is skipped; the next three
+  // whitespace-separated tokens are read as job id, user name and status.
+  // NOTE(review): the skipped line is presumably the column header of the
+  // LSF status listing -- confirm against the command that writes logFile.
+  JobInfo_eLSF::JobInfo_eLSF(int id, string logFile) : JobInfo()
+  {
+    // fill the _param member
+    ostringstream oss;
+    oss << id;
+    _param[ID] = oss.str();
+
+    // read status of job in log file
+    ifstream fp(logFile.c_str(),ios::in);
+    // std::getline has no length limit; the former getline(line,80) set
+    // failbit on a first line of 80 characters or more, which made every
+    // following extraction fail.
+    string header;
+    std::getline(fp, header);
+
+    string sjobid, username, status;
+    fp >> sjobid;
+    fp >> username;
+    fp >> status;
+
+    _param[STATE] = status;
+
+    // always assign the flag so it is never read uninitialised
+    _running = ( status.find("RUN") != string::npos );
+
+  }
+
+  // Tests whether the job is present on the machine
+  bool JobInfo_eLSF::isRunning() const
+  {
+    return _running;
+  }
+
+
+  // Destructor
+  JobInfo_eLSF::~JobInfo_eLSF()
+  {
+    // Nothing to do
+  }
+
+  // Converts a duration written HH:MM:SS into seconds.
+  // Fields that sscanf cannot parse count as 0 (they used to be left
+  // uninitialised).
+  long JobInfo_eLSF::HMStoLong(const string & s)
+  {
+    long hour = 0, min = 0, sec = 0;
+
+    sscanf( s.c_str(), "%ld:%ld:%ld", &hour, &min, &sec);
+    return ( ( ( hour * 60L ) + min ) * 60L ) + sec;
+  }
+
+  // Python (SWIG) interface: string representation shown in Python
+  string JobInfo_eLSF::__str__() const
+  {
+    ostringstream sst;
+    sst << "
+#include
+#include
+#include
+#include "Batch_Parametre.hxx"
+#include "Batch_Environnement.hxx"
+#include "Batch_RunTimeException.hxx"
+#include "Batch_APIInternalFailureException.hxx"
+#include "Batch_JobInfo_ePBS.hxx"
+
+namespace Batch {
+
+
+
+  // Constructor: fills _param[ID] with `id`, then scans `logFile` for the
+  // first line containing "job_state" and takes the third
+  // whitespace-separated token of that line as the status ("U" when no such
+  // line exists).
+  JobInfo_ePBS::JobInfo_ePBS(int id, string logFile) : JobInfo()
+  {
+    // fill the _param member
+    ostringstream oss;
+    oss << id;
+    _param[ID] = oss.str();
+
+    // read of log file
+    ifstream fp(logFile.c_str(),ios::in);
+
+    string status;
+    string sline;
+    // size_type instead of int: npos does not fit in an int. std::getline
+    // has no length limit, whereas the former getline(line,80) set failbit
+    // on any line of 80 characters or more and ended the scan before
+    // job_state was reached.
+    string::size_type pos = string::npos;
+    while( (pos == string::npos) && std::getline(fp, sline) )
+      pos = sline.find("job_state");
+
+    if(pos!=string::npos){
+      // three extractions: the last one leaves the third token in status
+      istringstream iss(sline);
+      iss >> status;
+      iss >> status;
+      iss >> status;
+    }
+    else
+      status = "U";
+
+    _param[STATE] = status;
+
+    // always assign the flag so it is never read uninitialised
+    _running = ( status.find("R") != string::npos );
+
+  }
+
+  // Tests whether the job is present on the machine
+  bool JobInfo_ePBS::isRunning() const
+  {
+    return _running;
+  }
+
+
+  // Destructor
+  JobInfo_ePBS::~JobInfo_ePBS()
+  {
+    // Nothing to do
+  }
+
+  // Converts a duration written HH:MM:SS into seconds.
+  // Fields that sscanf cannot parse count as 0 (they used to be left
+  // uninitialised).
+  long JobInfo_ePBS::HMStoLong(const string & s)
+  {
+    long hour = 0, min = 0, sec = 0;
+
+    sscanf( s.c_str(), "%ld:%ld:%ld", &hour, &min, &sec);
+    return ( ( ( hour * 60L ) + min ) * 60L ) + sec;
+  }
+
+  // Python (SWIG) interface: string representation shown in Python
+  string JobInfo_ePBS::__str__() const
+  {
+    ostringstream sst;
+    sst << " TypeMap; // map interne servant a controler le type de la valeur associee a chaque clef
@@ -151,5 +152,6 @@ def_extern_MapKey(USEDRAMSIZE);
 def_extern_MapKey(USEDWALLTIME);
 def_extern_MapKey(USER);
 def_extern_MapKey(WORKDIR);
+def_extern_MapKey(HOMEDIR); #endif diff --git a/src/Batch/Makefile.am b/src/Batch/Makefile.am index 0ec62e54c..b936e299a 100644 --- a/src/Batch/Makefile.am +++ b/src/Batch/Makefile.am @@ -59,7 +59,16 @@ LIB_INCLUDES = \ Batch_PyVersatile.hxx \ Batch_RunTimeException.hxx \ Batch_StringType.hxx \ - Batch_TypeMismatchException.hxx + Batch_TypeMismatchException.hxx \ + Batch_BatchManager_eClient.hxx \ + Batch_FactBatchManager_eClient.hxx \ + Batch_BatchManager_eLSF.hxx \ + Batch_FactBatchManager_eLSF.hxx \ + Batch_JobInfo_eLSF.hxx \ + Batch_BatchManager_ePBS.hxx \ + Batch_FactBatchManager_ePBS.hxx \ + Batch_JobInfo_ePBS.hxx \ + MpiImpl.hxx LIB_SRC = \ @@ -91,7 +100,16 @@ LIB_SRC = \ Batch_PyVersatile.cxx \ Batch_RunTimeException.cxx \ Batch_StringType.cxx \ - Batch_TypeMismatchException.cxx + Batch_TypeMismatchException.cxx \ + Batch_BatchManager_eClient.cxx \ + Batch_FactBatchManager_eClient.cxx \ + Batch_BatchManager_eLSF.cxx \ + Batch_FactBatchManager_eLSF.cxx \ + Batch_JobInfo_eLSF.cxx \ + Batch_BatchManager_ePBS.cxx \ + Batch_FactBatchManager_ePBS.cxx \ + Batch_JobInfo_ePBS.cxx \ + MpiImpl.cxx LIB_CPPFLAGS = \ diff --git a/src/Launcher/MpiImpl.cxx b/src/Batch/MpiImpl.cxx similarity index 84% rename from src/Launcher/MpiImpl.cxx rename to src/Batch/MpiImpl.cxx index 036018b1e..52fab7a8d 100644 --- a/src/Launcher/MpiImpl.cxx +++ b/src/Batch/MpiImpl.cxx @@ -29,7 +29,6 @@ #include #include #include -#include "utilities.h" #include "MpiImpl.hxx" using namespace std; @@ -37,13 +36,11 @@ using namespace std; // Constructor MpiImpl::MpiImpl() { - MESSAGE("MpiImpl constructor"); } // Destructor MpiImpl::~MpiImpl() { - MESSAGE("MpiImpl destructor"); } // lam implementation @@ -55,7 +52,6 @@ MpiImpl_LAM::MpiImpl_LAM() : MpiImpl() // Destructor MpiImpl_LAM::~MpiImpl_LAM() { - MESSAGE("MpiImpl_LAM destructor"); } string MpiImpl_LAM::size() @@ -98,17 +94,16 @@ MpiImpl_MPICH1::MpiImpl_MPICH1() : MpiImpl() // Destructor MpiImpl_MPICH1::~MpiImpl_MPICH1() { - 
MESSAGE("MpiImpl_MPICH1 destructor"); } string MpiImpl_MPICH1::size() { - throw SALOME_Exception("mpich1 doesn't work with this batch system to submit salome session"); + throw MpiImplException("mpich1 doesn't work with this batch system to submit salome session"); } string MpiImpl_MPICH1::rank() { - throw SALOME_Exception("mpich1 doesn't work with this batch system to submit salome session"); + throw MpiImplException("mpich1 doesn't work with this batch system to submit salome session"); } string MpiImpl_MPICH1::boot(const string machinefile, const unsigned int nbnodes) @@ -137,7 +132,6 @@ MpiImpl_MPICH2::MpiImpl_MPICH2() : MpiImpl() // Destructor MpiImpl_MPICH2::~MpiImpl_MPICH2() { - MESSAGE("MpiImpl_MPICH2 destructor"); } string MpiImpl_MPICH2::size() @@ -180,7 +174,6 @@ MpiImpl_OPENMPI::MpiImpl_OPENMPI() : MpiImpl() // Destructor MpiImpl_OPENMPI::~MpiImpl_OPENMPI() { - MESSAGE("MpiImpl_OPENMPI destructor"); } string MpiImpl_OPENMPI::size() @@ -210,3 +203,41 @@ string MpiImpl_OPENMPI::halt() return ""; } +// slurm implementation +// Constructor +MpiImpl_SLURM::MpiImpl_SLURM() : MpiImpl() +{ +} + +// Destructor +MpiImpl_SLURM::~MpiImpl_SLURM() +{ +} + +string MpiImpl_SLURM::size() +{ + return "${SLURM_NPROCS}"; +} + +string MpiImpl_SLURM::rank() +{ + return "${SLURM_PROCID}"; +} + +string MpiImpl_SLURM::boot(const string machinefile, const unsigned int nbnodes) +{ + return ""; +} + +string MpiImpl_SLURM::run(const string machinefile, const unsigned int nbproc, const string fileNameToExecute) +{ + ostringstream oss; + oss << "srun " << fileNameToExecute << endl; + return oss.str(); +} + +string MpiImpl_SLURM::halt() +{ + return ""; +} + diff --git a/src/Launcher/MpiImpl.hxx b/src/Batch/MpiImpl.hxx similarity index 85% rename from src/Launcher/MpiImpl.hxx rename to src/Batch/MpiImpl.hxx index beeac0301..07f306bdf 100644 --- a/src/Launcher/MpiImpl.hxx +++ b/src/Batch/MpiImpl.hxx @@ -30,8 +30,14 @@ #define _BL_MPIIMPL_H_ #include -#include 
"Utils_SALOME_Exception.hxx" -#include + +class MpiImplException +{ +public: + const std::string msg; + + MpiImplException(const std::string m) : msg(m) {} +}; class MpiImpl { @@ -128,4 +134,23 @@ private: }; +class MpiImpl_SLURM : public MpiImpl +{ +public: + // Constructeur et destructeur + MpiImpl_SLURM(); // constructor + virtual ~MpiImpl_SLURM(); //Destructor + + std::string size(); // get number of process of current job + std::string rank(); // get process number of current job + std::string boot( const std::string machinefile, const unsigned int nbnodes); // get boot command + std::string run( const std::string machinefile, const unsigned int nbproc, const std::string fileNameToExecute); // get run command + std::string halt(); // get stop command + +protected: + +private: + +}; + #endif diff --git a/src/Container/Makefile.am b/src/Container/Makefile.am index a7cbd05da..0ad023f7a 100644 --- a/src/Container/Makefile.am +++ b/src/Container/Makefile.am @@ -83,6 +83,7 @@ COMMON_LIBS =\ ../Registry/libRegistry.la \ ../Notification/libSalomeNotification.la \ ../ResourcesManager/libSalomeResourcesManager.la \ + ../ResourcesManager/libResourcesManager.la \ ../NamingService/libSalomeNS.la \ ../Utils/libOpUtil.la \ ../SALOMELocalTrace/libSALOMELocalTrace.la \ diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index fe6cc3165..52802a144 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -21,6 +21,7 @@ #include "SALOME_NamingService.hxx" #include "OpUtil.hxx" #include +#include #ifndef WNT #include #endif @@ -75,6 +76,8 @@ SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableSer Engines::ContainerManager::_narrow(obj); _NS->Register(refContMan,_ContainerManagerNameInNS); + _MpiStarted = false; + _isAppliSalomeDefined = (getenv("APPLI") != 0); MESSAGE("constructor end"); } @@ -211,17 +214,21 @@ StartContainer(const Engines::MachineParameters& 
params, MESSAGE("SALOME_ContainerManager::StartContainer " << possibleComputers.length()); + vector lm; + for(int i=0;iFindFirst(possibleComputers); + theMachine=_ResManager->GetImpl()->FindFirst(lm); break; case Engines::P_CYCL: - theMachine=_ResManager->FindNext(possibleComputers); + theMachine=_ResManager->GetImpl()->FindNext(lm); break; case Engines::P_BEST: - theMachine=_ResManager->FindBest(possibleComputers); + theMachine=_ResManager->GetImpl()->FindBest(lm); break; } } @@ -252,11 +259,11 @@ StartContainer(const Engines::MachineParameters& params, return Engines::Container::_nil(); } else if(theMachine==GetHostname()) - command=_ResManager->BuildCommandToLaunchLocalContainer(params,id); + command = BuildCommandToLaunchLocalContainer(params,id); else - command = _ResManager->BuildCommandToLaunchRemoteContainer(theMachine,params,id); + command = BuildCommandToLaunchRemoteContainer(theMachine,params,id); - _ResManager->RmTmpFile(); + RmTmpFile(); //check if an entry exists in Naming service if(params.isMPI) @@ -390,7 +397,7 @@ FindOrStartParallelContainer(const Engines::MachineParameters& params_const, params.hostname = CORBA::string_dup(theMachine.c_str()); Engines::MachineParameters params_proxy(params); try { - command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerProxy", params_proxy, "xterm"); + command = BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerProxy", params_proxy, "xterm"); } catch(const SALOME_Exception & ex){ MESSAGE(ex.what()); @@ -403,7 +410,7 @@ FindOrStartParallelContainer(const Engines::MachineParameters& params_const, proxy = PaCO::InterfaceManager::_narrow(obj); // Step 4 : starting parallel container nodes - command = _ResManager->BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerNode", params, "xterm"); + command = BuildCommandToLaunchLocalParallelContainer("SALOME_ParallelContainerNode", params, "xterm"); string name = _NS->ContainerName(params) + "Node"; 
LaunchParallelContainer(command, params, name); // Step 5 : connecting nodes and the proxy to actually create a parallel container @@ -686,3 +693,579 @@ void SALOME_ContainerManager::fillBatchLaunchedContainers() } _batchLaunchedContainersIter=_batchLaunchedContainers.begin(); } + +//============================================================================= +/*! + * This is no longer valid (C++ container are also python containers) + */ +//============================================================================= + +bool isPythonContainer(const char* ContainerName) +{ + bool ret = false; + int len = strlen(ContainerName); + + if (len >= 2) + if (strcmp(ContainerName + len - 2, "Py") == 0) + ret = true; + + return ret; +} + +//============================================================================= +/*! + * Builds the script to be launched + * + * If SALOME Application not defined ($APPLI), + * see BuildTempFileToLaunchRemoteContainer() + * + * Else rely on distant configuration. Command is under the form (example): + * ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \ + * SALOME_Container containerName &" + + * - where user is ommited if not specified in CatalogResources, + * - where distant path is always relative to user@machine $HOME, and + * equal to $APPLI if not specified in CatalogResources, + * - where hostNS is the hostname of CORBA naming server (set by scripts to + * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh) + * - where portNS is the port used by CORBA naming server (set by scripts to + * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh) + * - where workingdir is the requested working directory for the container. 
+ * If WORKINGDIR (and workingdir) is not present the working dir will be $HOME + */ +//============================================================================= + +string +SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer +(const string& machine, + const Engines::MachineParameters& params, const long id) +{ + string command; + int nbproc; + char idc[3*sizeof(long)]; + + if ( ! _isAppliSalomeDefined ) + command = BuildTempFileToLaunchRemoteContainer(machine, params); + + else + { + const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine); + + if (params.isMPI) + { + if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) + nbproc = 1; + else if ( params.nb_node == 0 ) + nbproc = params.nb_proc_per_node; + else if ( params.nb_proc_per_node == 0 ) + nbproc = params.nb_node; + else + nbproc = params.nb_node * params.nb_proc_per_node; + } + + // "ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \ + // SALOME_Container containerName &" + + if (resInfo.Protocol == rsh) + command = "rsh "; + else if (resInfo.Protocol == ssh) + command = "ssh "; + else + throw SALOME_Exception("Unknown protocol"); + + if (resInfo.UserName != "") + { + command += resInfo.UserName; + command += "@"; + } + + command += machine; + command += " "; + + if (resInfo.AppliPath != "") + command += resInfo.AppliPath; // path relative to user@machine $HOME + else + { + ASSERT(getenv("APPLI")); + command += getenv("APPLI"); // path relative to user@machine $HOME + } + + command += "/runRemote.sh "; + + ASSERT(getenv("NSHOST")); + command += getenv("NSHOST"); // hostname of CORBA name server + + command += " "; + ASSERT(getenv("NSPORT")); + command += getenv("NSPORT"); // port of CORBA name server + + std::string wdir=params.workingdir.in(); + if(wdir != "") + { + command += " WORKINGDIR "; + command += " '"; + if(wdir == "$TEMPDIR") + wdir="\\$TEMPDIR"; + command += wdir; // requested working directory + command += "'"; + 
} + + if(params.isMPI) + { + command += " mpirun -np "; + std::ostringstream o; + o << nbproc << " "; + command += o.str(); +#ifdef WITHLAM + command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif + command += " SALOME_MPIContainer "; + } + else + command += " SALOME_Container "; + + command += _NS->ContainerName(params); + command += " -id "; + sprintf(idc,"%ld",id); + command += idc; + command += " -"; + AddOmninamesParams(command); + + MESSAGE("command =" << command); + } + + return command; +} + +//============================================================================= +/*! + * builds the command to be launched. + */ +//============================================================================= + +string +SALOME_ContainerManager::BuildCommandToLaunchLocalContainer +(const Engines::MachineParameters& params, const long id) +{ + _TmpFileName = ""; + string command; + int nbproc = 0; + char idc[3*sizeof(long)]; + + if (params.isMPI) + { + command = "mpirun -np "; + + if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) + nbproc = 1; + else if ( params.nb_node == 0 ) + nbproc = params.nb_proc_per_node; + else if ( params.nb_proc_per_node == 0 ) + nbproc = params.nb_node; + else + nbproc = params.nb_node * params.nb_proc_per_node; + + std::ostringstream o; + + o << nbproc << " "; + + command += o.str(); +#ifdef WITHLAM + command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif + + if (isPythonContainer(params.container_name)) + command += "pyMPI SALOME_ContainerPy.py "; + else + command += "SALOME_MPIContainer "; + } + + else + { + command=""; + std::string wdir=params.workingdir.in(); + if(wdir != "") + { + // a working directory is requested + if(wdir == "$TEMPDIR") + { + // a new temporary directory is requested + char dir[]="/tmp/salomeXXXXXX"; + char* mdir=mkdtemp(dir); + if(mdir==NULL) + std::cerr << "Problem in mkdtemp " << dir << " " << mdir << std::endl; + else + command="cd "+std::string(dir)+";"; + } 
+ else + { + // a permanent directory is requested use it or create it + command="mkdir -p " + wdir + " && cd " + wdir + ";"; + } + } + if (isPythonContainer(params.container_name)) + command += "SALOME_ContainerPy.py "; + else + command += "SALOME_Container "; + } + + command += _NS->ContainerName(params); + command += " -id "; + sprintf(idc,"%ld",id); + command += idc; + command += " -"; + AddOmninamesParams(command); + + MESSAGE("Command is ... " << command); + return command; +} + + +//============================================================================= +/*! + * removes the generated temporary file in case of a remote launch. + */ +//============================================================================= + +void SALOME_ContainerManager::RmTmpFile() +{ + if (_TmpFileName != "") + { +#ifndef WNT + string command = "rm "; +#else + string command = "del /F "; +#endif + command += _TmpFileName; + char *temp = strdup(command.c_str()); + int lgthTemp = strlen(temp); + temp[lgthTemp - 3] = '*'; + temp[lgthTemp - 2] = '\0'; + system(temp); + free(temp); + } +} + +//============================================================================= +/*! + * add to command all options relative to naming service. + */ +//============================================================================= + +void SALOME_ContainerManager::AddOmninamesParams(string& command) const + { + CORBA::String_var iorstr = _NS->getIORaddr(); + command += "ORBInitRef NameService="; + command += iorstr; + } + + +//============================================================================= +/*! + * add to command all options relative to naming service. 
+ */ +//============================================================================= + +void SALOME_ContainerManager::AddOmninamesParams(ofstream& fileStream) const + { + CORBA::String_var iorstr = _NS->getIORaddr(); + fileStream << "ORBInitRef NameService="; + fileStream << iorstr; + } + +//============================================================================= +/*! + * generate a file name in /tmp directory + */ +//============================================================================= + +string SALOME_ContainerManager::BuildTemporaryFileName() const + { + //build more complex file name to support multiple salome session + char *temp = new char[19]; + strcpy(temp, "/tmp/command"); + strcat(temp, "XXXXXX"); +#ifndef WNT + + mkstemp(temp); +#else + + char aPID[80]; + itoa(getpid(), aPID, 10); + strcat(temp, aPID); +#endif + + string command(temp); + delete [] temp; + command += ".sh"; + return command; + } + + +//============================================================================= +/*! + * Builds in a temporary file the script to be launched. + * + * Used if SALOME Application ($APPLI) is not defined. + * The command is build with data from CatalogResources, in which every path + * used on remote computer must be defined. + */ +//============================================================================= + +string +SALOME_ContainerManager::BuildTempFileToLaunchRemoteContainer +(const string& machine, + const Engines::MachineParameters& params) throw(SALOME_Exception) +{ + int status; + + _TmpFileName = BuildTemporaryFileName(); + ofstream tempOutputFile; + tempOutputFile.open(_TmpFileName.c_str(), ofstream::out ); + const ParserResourcesType& resInfo = _ResManager->GetImpl()->GetResourcesList(machine); + tempOutputFile << "#! 
/bin/sh" << endl; + + // --- set env vars + + tempOutputFile << "export SALOME_trace=local" << endl; // mkr : 27.11.2006 : PAL13967 - Distributed supervision graphs - Problem with "SALOME_trace" + //tempOutputFile << "source " << resInfo.PreReqFilePath << endl; + + // ! env vars + + if (params.isMPI) + { + tempOutputFile << "mpirun -np "; + int nbproc; + + if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) + nbproc = 1; + else if ( params.nb_node == 0 ) + nbproc = params.nb_proc_per_node; + else if ( params.nb_proc_per_node == 0 ) + nbproc = params.nb_node; + else + nbproc = params.nb_node * params.nb_proc_per_node; + + std::ostringstream o; + + tempOutputFile << nbproc << " "; +#ifdef WITHLAM + tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; +#endif + } + + tempOutputFile << getenv("KERNEL_ROOT_DIR") << "/bin/salome/"; + + if (params.isMPI) + { + if (isPythonContainer(params.container_name)) + tempOutputFile << "pyMPI SALOME_ContainerPy.py "; + else + tempOutputFile << "SALOME_MPIContainer "; + } + + else + { + if (isPythonContainer(params.container_name)) + tempOutputFile << "SALOME_ContainerPy.py "; + else + tempOutputFile << "SALOME_Container "; + } + + tempOutputFile << _NS->ContainerName(params) << " -"; + AddOmninamesParams(tempOutputFile); + tempOutputFile << " &" << endl; + tempOutputFile.flush(); + tempOutputFile.close(); + chmod(_TmpFileName.c_str(), 0x1ED); + + // --- Build command + + string command; + + if (resInfo.Protocol == rsh) + { + command = "rsh "; + string commandRcp = "rcp "; + commandRcp += _TmpFileName; + commandRcp += " "; + commandRcp += machine; + commandRcp += ":"; + commandRcp += _TmpFileName; + status = system(commandRcp.c_str()); + } + + else if (resInfo.Protocol == ssh) + { + command = "ssh "; + string commandRcp = "scp "; + commandRcp += _TmpFileName; + commandRcp += " "; + commandRcp += machine; + commandRcp += ":"; + commandRcp += _TmpFileName; + status = system(commandRcp.c_str()); + } + 
else + throw SALOME_Exception("Unknown protocol"); + + if(status) + throw SALOME_Exception("Error of connection on remote host"); + + command += machine; + _CommandForRemAccess = command; + command += " "; + command += _TmpFileName; + + SCRUTE(command); + + return command; + +} + +//============================================================================= +/*! Creates a command line that the container manager uses to launch + * a parallel container. + */ +//============================================================================= +string +SALOME_ContainerManager::BuildCommandToLaunchLocalParallelContainer(const std::string& exe_name, + const Engines::MachineParameters& params, + const std::string& log) +{ + // This method knows the differences between the proxy and the nodes. + // nb_component_nodes is not used in the same way if it is a proxy or + // a node. + + string command; + string parallelLib(CORBA::string_dup(params.parallelLib)); + string hostname(CORBA::string_dup(params.hostname)); + int par = exe_name.find("Proxy"); + int nbproc = params.nb_component_nodes; + char buffer [33]; + sprintf(buffer,"%d",nbproc); + + Engines::MachineParameters_var rtn = new Engines::MachineParameters(); + rtn->container_name = params.container_name; + rtn->hostname = params.hostname; + rtn->OS = params.OS; + rtn->mem_mb = params.mem_mb; + rtn->cpu_clock = params.cpu_clock; + rtn->nb_proc_per_node = params.nb_proc_per_node; + rtn->nb_node = params.nb_node; + rtn->isMPI = params.isMPI; + + string real_exe_name = exe_name + parallelLib; + + if (parallelLib == "Dummy") + { + //command = "gdb --args "; + //command = "valgrind --tool=memcheck --log-file=val_log "; + //command += real_exe_name; + + command = real_exe_name; + + command += " " + _NS->ContainerName(rtn); + command += " " + parallelLib; + command += " " + hostname; + command += " -"; + AddOmninamesParams(command); + } + + else if (parallelLib == "Mpi") + { + // Step 1 : check if MPI is started + if (_MpiStarted 
== false) + { + startMPI(); + } + + if (par < 0) + { + // Nodes case + + command = "mpiexec -np " + string(buffer) + " "; +// command += "gdb --args "; + command += real_exe_name; + command += " " + _NS->ContainerName(rtn); + command += " " + parallelLib; + command += " " + hostname; + command += " -"; + AddOmninamesParams(command); + } + else + { + // Proxy case + command = "mpiexec -np 1 "; + command += real_exe_name; + command += " " + _NS->ContainerName(rtn); + command += " " + string(buffer); + command += " " + parallelLib; + command += " " + hostname; + command += " -"; + AddOmninamesParams(command); + } + } + else + { + std::string message("Unknown parallelLib" + parallelLib); + throw SALOME_Exception(message.c_str()); + } + + // log choice + if (log == "default") + { + command += " > /tmp/"; + command += _NS->ContainerName(rtn); + command += "_"; + command += GetHostname(); + command += "_"; + command += getenv( "USER" ) ; + command += ".log 2>&1 &" ; + } + if (log == "xterm") + { + command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; " + + command + " \" &"; +// + command + "; echo $LD_LIBRARY_PATH; cat \" &"; + } + return command; + +/* if (log == "xterm") + { + command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; echo $LD_LIBRARY_PATH; echo $PATH; " + command + "; cat \" &"; + } +*/ +/* command = "cd ; rm " + fichier_commande + "; touch " + \ + fichier_commande + "; echo \" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; " + \ + command + " >& /tmp/ribes_" + fichier_commande + " & \" > " + fichier_commande + ";"; + command += "ssh cn01 sh " + fichier_commande + " &"; + cerr << "La commande : " << command << endl; +*/ +} + +void SALOME_ContainerManager::startMPI() +{ + cerr << "----------------------------------------------" << endl; + cerr << "----------------------------------------------" << endl; + cerr << "----------------------------------------------" << endl; + cerr 
<< "-Only Lam on Localhost is currently supported-" << endl; + cerr << "----------------------------------------------" << endl; + cerr << "----------------------------------------------" << endl; + cerr << "----------------------------------------------" << endl; + + int status = system("lamboot"); + if (status == -1) + { + INFOS("lamboot failed : system command status -1"); + } + else if (status == 217) + { + INFOS("lamboot failed : system command status 217"); + } + else + { + _MpiStarted = true; + } +} + diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index 34888f98c..d8be68833 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -86,6 +86,30 @@ protected: void fillBatchLaunchedContainers(); long GetIdForContainer(void); + + std::string BuildCommandToLaunchRemoteContainer(const std::string& machine, + const Engines::MachineParameters& params, const long id); + + std::string BuildCommandToLaunchLocalContainer(const Engines::MachineParameters& params, const long id); + + std::string BuildTempFileToLaunchRemoteContainer(const std::string& machine, + const Engines::MachineParameters& params) throw(SALOME_Exception); + + void RmTmpFile(); + + void AddOmninamesParams(std::string& command) const; + + void AddOmninamesParams(std::ofstream& fileStream) const; + + std::string BuildTemporaryFileName() const; + + // Parallel extension + std::string BuildCommandToLaunchLocalParallelContainer(const std::string& exe_name, + const Engines::MachineParameters& params, + const std::string& log = "default"); + void startMPI(); + bool _MpiStarted; + long _id; CORBA::ORB_var _orb; PortableServer::POA_var _poa; @@ -94,6 +118,16 @@ protected: SALOME_NamingService *_NS; static std::vector _batchLaunchedContainers; static std::vector::iterator _batchLaunchedContainersIter; -}; + //! attribute that contains current tmp files generated + std::string _TmpFileName; + + //! 
contains the rsh or ssh command to access directly to machine. + // Only used by this->RmTmpFile in case of a remote launch. + std::string _CommandForRemAccess; + + //! different behaviour if $APPLI exists (SALOME Application) + bool _isAppliSalomeDefined; + +}; #endif diff --git a/src/Launcher/BatchLight_BatchManager.cxx b/src/Launcher/BatchLight_BatchManager.cxx deleted file mode 100644 index 5d7dc2f46..000000000 --- a/src/Launcher/BatchLight_BatchManager.cxx +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, -// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License. -// -// This library is distributed in the hope that it will be useful -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
-// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -// -/* - * BatchManager.cxx : - * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME - * - */ - -#include -#include -#include -#include -#include "BatchLight_Job.hxx" -#include "BatchLight_BatchManager.hxx" -#include "Batch_Date.hxx" -using namespace std; - -namespace BatchLight { - - // Constructeur - BatchManager::BatchManager(const batchParams& p) throw(SALOME_Exception) : _params(p) - { - SCRUTE(_params.hostname); - SCRUTE(_params.protocol); - SCRUTE(_params.username); - // On verifie que le hostname est correct - if (!gethostbyname(_params.hostname.c_str())) { // hostname unknown from network - string msg = "hostname \""; - msg += _params.hostname; - msg += "\" unknown from the network"; - throw SALOME_Exception(msg.c_str()); - } - _mpiImpl = NULL; - } - - // Destructeur - BatchManager::~BatchManager() - { - MESSAGE("BatchManager destructor "<<_params.hostname); - std::map < int, const BatchLight::Job * >::const_iterator it; - for(it=_jobmap.begin();it!=_jobmap.end();it++) - delete it->second; - if(_mpiImpl) delete _mpiImpl; - } - - // Methode pour le controle des jobs : soumet un job au gestionnaire - const int BatchManager::submitJob(Job* job) - { - BEGIN_OF("BatchManager::submitJob"); - int id; - - // temporary directory on cluster to put input files for job - setDirForTmpFiles(job); - - // Set Home director - setHomeDir(job); - - // export input files on cluster - exportInputFiles(job); - - // build salome coupling script for job - buildSalomeCouplingScript(job); - - // build batch script for job - buildSalomeBatchScript(job); - - // submit job on cluster - id = submit(job); - - // register job on map - _jobmap[id] = job; 
- END_OF("BatchManager::submitJob"); - return id; - } - - void BatchManager::setDirForTmpFiles(BatchLight::Job* job) - { - std::string dirForTmpFiles; - std::string thedate; - - // Adding date to the directory name - Batch::Date date = Batch::Date(time(0)); - thedate = date.str(); - int lend = thedate.size() ; - int i = 0 ; - while ( i < lend ) { - if ( thedate[i] == '/' || thedate[i] == '-' || thedate[i] == ':' ) { - thedate[i] = '_' ; - } - i++ ; - } - - dirForTmpFiles += string("Batch/"); - dirForTmpFiles += thedate ; - job->setDirForTmpFiles(dirForTmpFiles); - } - - void BatchManager::setHomeDir(BatchLight::Job* job) - { - std::string home; - std::string command; - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - int idx = dirForTmpFiles.find("Batch/"); - std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); - filelogtemp = "/tmp/logs" + filelogtemp + "_home"; - - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += " 'echo $HOME' > "; - command += filelogtemp; - SCRUTE(command.c_str()); - int status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of launching home command on remote host"); - - std::ifstream file_home(filelogtemp.c_str()); - std::getline(file_home, home); - file_home.close(); - job->setHomeDir(home); - } - - void BatchManager::exportInputFiles(BatchLight::Job* job) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager::exportInFiles"); - int status; - const char * fileToExecute = job->getFileToExecute(); - const Engines::FilesList filesToExportList = job->getFilesToExportList(); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - std::string command; - std::string copy_command; - - // Test protocol - if( _params.protocol == "rsh" ) - 
copy_command = "rcp "; - else if( _params.protocol == "ssh" ) - copy_command = "scp "; - else - throw SALOME_Exception("Unknown protocol : only rsh and ssh are known !"); - - // First step : creating batch tmp files directory - command = _params.protocol; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += " \"mkdir -p "; - command += dirForTmpFiles; - command += "\"" ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) { - std::ostringstream oss; - oss << status; - std::string ex_mess("Error of connection on remote host ! status = "); - ex_mess += oss.str(); - throw SALOME_Exception(ex_mess.c_str()); - } - - // Second step : copy fileToExecute into - // batch tmp files directory - command = copy_command; - command += fileToExecute; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) { - std::ostringstream oss; - oss << status; - std::string ex_mess("Error of connection on remote host ! status = "); - ex_mess += oss.str(); - throw SALOME_Exception(ex_mess.c_str()); - } - - // Third step : copy filesToExportList into - // batch tmp files directory - for (int i = 0 ; i < filesToExportList.length() ; i++ ) { - command = copy_command; - command += filesToExportList[i] ; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) { - std::ostringstream oss; - oss << status; - std::string ex_mess("Error of connection on remote host ! 
status = "); - ex_mess += oss.str(); - throw SALOME_Exception(ex_mess.c_str()); - } - } - - END_OF("BatchManager::exportInFiles"); - } - - void BatchManager::importOutputFiles( const char *directory, const CORBA::Long jobId ) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager::importOutputFiles"); - string command; - int status; - - const BatchLight::Job* myJob = _jobmap[jobId]; - Engines::FilesList filesToImportList = myJob->getFilesToImportList(); - - for ( int i = 0 ; i < filesToImportList.length() ; i++ ) { - if( _params.protocol == "rsh" ) - command = "rcp "; - else if( _params.protocol == "ssh" ) - command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += filesToImportList[i] ; - command += " "; - command += directory; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - { - // Try to get what we can (logs files) - // throw SALOME_Exception("Error of connection on remote host"); - std::string mess("Copy command failed ! 
status is :"); - ostringstream status_str; - status_str << status; - mess += status_str.str(); - INFOS(mess); - } - } - - END_OF("BatchManager::importOutputFiles"); - } - - string BatchManager::BuildTemporaryFileName() const - { - //build more complex file name to support multiple salome session - char *temp = new char[19]; - strcpy(temp, "/tmp/command"); - strcat(temp, "XXXXXX"); -#ifndef WNT - mkstemp(temp); -#else - char aPID[80]; - itoa(getpid(), aPID, 10); - strcat(temp, aPID); -#endif - - string command(temp); - delete [] temp; - command += ".sh"; - return command; - } - - void BatchManager::RmTmpFile(std::string & TemporaryFileName) - { - string command = "rm "; - command += TemporaryFileName; - char *temp = strdup(command.c_str()); - int lgthTemp = strlen(temp); - temp[lgthTemp - 3] = '*'; - temp[lgthTemp - 2] = '\0'; - system(temp); - free(temp); - } - - MpiImpl *BatchManager::FactoryMpiImpl(string mpiImpl) throw(SALOME_Exception) - { - if(mpiImpl == "lam") - return new MpiImpl_LAM(); - else if(mpiImpl == "mpich1") - return new MpiImpl_MPICH1(); - else if(mpiImpl == "mpich2") - return new MpiImpl_MPICH2(); - else if(mpiImpl == "openmpi") - return new MpiImpl_OPENMPI(); - else if(mpiImpl == "indif") - throw SALOME_Exception("you must specify a mpi implementation in CatalogResources.xml file"); - else{ - ostringstream oss; - oss << mpiImpl << " : not yet implemented"; - throw SALOME_Exception(oss.str().c_str()); - } - } - -} diff --git a/src/Launcher/BatchLight_BatchManager.hxx b/src/Launcher/BatchLight_BatchManager.hxx deleted file mode 100644 index c5d3c70c3..000000000 --- a/src/Launcher/BatchLight_BatchManager.hxx +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, -// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either 
-// version 2.1 of the License. -// -// This library is distributed in the hope that it will be useful -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -// -/* - * BatchManager.hxx : - * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME - * - */ - -#ifndef _BL_BATCHMANAGER_H_ -#define _BL_BATCHMANAGER_H_ - -#include -#include -#include -#include -#include "Utils_SALOME_Exception.hxx" -#include -#include -#include CORBA_CLIENT_HEADER(SALOME_ContainerManager) -#include "MpiImpl.hxx" - -namespace BatchLight { - - class Job; - - struct batchParams{ - std::string hostname; // serveur ou tourne le BatchManager - std::string protocol; // protocole d'acces au serveur: ssh ou rsh - std::string username; // username d'acces au serveur - std::string applipath; // path of apllication directory on server - std::vector modulesList; // list of Salome modules installed on server - unsigned int nbnodes; // number of nodes on cluster - unsigned int nbprocpernode; // number of processors on each node - std::string mpiImpl; // mpi implementation - }; - - class BatchManager - { - public: - // Constructeur et destructeur - BatchManager(const batchParams& p) throw(SALOME_Exception); // connexion a la machine host - virtual ~BatchManager(); - - // Methodes pour le controle des jobs : virtuelles pures - const int submitJob(BatchLight::Job* job); // soumet un job au gestionnaire - virtual void deleteJob(const int & jobid) = 0; // retire un job du gestionnaire - virtual std::string queryJob(const int & jobid) = 0; // 
renvoie l'etat du job - void importOutputFiles( const char *directory, const CORBA::Long jobId ) throw(SALOME_Exception); - - protected: - batchParams _params; - MpiImpl *_mpiImpl; - std::map _jobmap; - - virtual int submit(BatchLight::Job* job) throw(SALOME_Exception) = 0; - void setDirForTmpFiles(BatchLight::Job* job); - void setHomeDir(BatchLight::Job* job); - void exportInputFiles(BatchLight::Job* job) throw(SALOME_Exception); - virtual void buildSalomeCouplingScript(BatchLight::Job* job) throw(SALOME_Exception) = 0; - virtual void buildSalomeBatchScript(BatchLight::Job* job) throw(SALOME_Exception) = 0; - - std::string BuildTemporaryFileName() const; - void RmTmpFile(std::string & TemporaryFileName); - MpiImpl *FactoryMpiImpl(std::string mpiImpl) throw(SALOME_Exception); - private: - - }; - -} - -#endif diff --git a/src/Launcher/BatchLight_BatchManager_PBS.cxx b/src/Launcher/BatchLight_BatchManager_PBS.cxx deleted file mode 100644 index 1eeefc2de..000000000 --- a/src/Launcher/BatchLight_BatchManager_PBS.cxx +++ /dev/null @@ -1,476 +0,0 @@ -// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, -// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License. -// -// This library is distributed in the hope that it will be useful -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
-// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -// -/* - * BatchManager.cxx : - * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME - * - */ - -#include "BatchLight_BatchManager_PBS.hxx" -#include "utilities.h" -#include "BatchLight_Job.hxx" -#include -#include -#include -#include - -using namespace std; - -namespace BatchLight { - - // Constructeur - BatchManager_PBS::BatchManager_PBS(const batchParams& p) throw(SALOME_Exception) : BatchManager(p) - { - // pbs batch system needs to know mpi implementation - _mpiImpl = FactoryMpiImpl(_params.mpiImpl); - } - - // Destructeur - BatchManager_PBS::~BatchManager_PBS() - { - MESSAGE("BatchManager_PBS destructor "<<_params.hostname); - } - - // Methode pour le controle des jobs : retire un job du gestionnaire - void BatchManager_PBS::deleteJob(const int & jobid) - { - BEGIN_OF("BatchManager_PBS::deleteJob"); - string command; - int status; - ostringstream oss; - oss << jobid; - - // define command to submit batch - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - - command += _params.hostname; - command += " \"qdel " ; - command += oss.str(); - command += "\""; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - MESSAGE("jobId = " << jobid << "killed"); - END_OF("BatchManager_PBS::deleteJob"); - } - - // Methode pour le controle des jobs : renvoie l'etat du job - string BatchManager_PBS::queryJob(const int & jobid) - { - 
BEGIN_OF("BatchManager_PBS::queryJob"); - // define name of log file - string jstatus; - string logFile="/tmp/logs/"; - logFile += getenv("USER"); - logFile += "/batchSalome_"; - - //srand ( time(NULL) ); - //int ir = rand(); - ostringstream oss; - //oss << ir; - oss << this << "_" << jobid; - logFile += oss.str(); - logFile += ".log"; - - string command; - int status; - - // define command to submit batch - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - - command += _params.hostname; - command += " \"qstat -f " ; - //ostringstream oss2; - //oss2 << jobid; - //command += oss2.str(); - command += _pbs_job_name[jobid]; - command += "\" > "; - command += logFile; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status && status != 153 && status != 256*153){ - MESSAGE("status="<> jstatus; - iss >> jstatus; - iss >> jstatus; - } - else - jstatus = "U"; - } - - MESSAGE("jobId = " << jobid << " " << jstatus); - END_OF("BatchManager_PBS::queryJob"); - return jstatus; - } - - void BatchManager_PBS::buildSalomeCouplingScript(BatchLight::Job* job) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager_PBS::buildSalomeCouplingScript"); - int status; - const char *fileToExecute = job->getFileToExecute(); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - int idx = dirForTmpFiles.find("Batch/"); - std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); - - string::size_type p1 = string(fileToExecute).find_last_of("/"); - string::size_type p2 = string(fileToExecute).find_last_of("."); - std::string fileNameToExecute = string(fileToExecute).substr(p1+1,p2-p1-1); - std::string TmpFileName = BuildTemporaryFileName(); - - ofstream tempOutputFile; - tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - - // Begin - 
tempOutputFile << "#! /bin/sh -f" << endl ; - tempOutputFile << "cd " ; - tempOutputFile << _params.applipath << endl ; - tempOutputFile << "export SALOME_BATCH=1\n"; - tempOutputFile << "export PYTHONPATH=~/" ; - tempOutputFile << dirForTmpFiles ; - tempOutputFile << ":$PYTHONPATH" << endl ; - - // Test node rank - tempOutputFile << "if test " ; - tempOutputFile << _mpiImpl->rank() ; - tempOutputFile << " = 0; then" << endl ; - - // ----------------------------------------------- - // Code for rank 0 : launch runAppli and a container - // RunAppli - tempOutputFile << " ./runAppli --terminal --modules=" ; - for ( int i = 0 ; i < _params.modulesList.size() ; i++ ) { - tempOutputFile << _params.modulesList[i] ; - if ( i != _params.modulesList.size()-1 ) - tempOutputFile << "," ; - } - tempOutputFile << " --standalone=registry,study,moduleCatalog --ns-port-log=" - << filelogtemp - << " &\n"; - - // Wait NamingService - tempOutputFile << " current=0\n" - << " stop=20\n" - << " while ! test -f " << filelogtemp << "\n" - << " do\n" - << " sleep 2\n" - << " let current=current+1\n" - << " if [ \"$current\" -eq \"$stop\" ] ; then\n" - << " echo Error Naming Service failed ! >&2" - << " exit\n" - << " fi\n" - << " done\n" - << " port=`cat " << filelogtemp << "`\n"; - - // Launch a container - tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'" - << _mpiImpl->rank() - << " > ~/" << dirForTmpFiles << "/YACS_Server_" - << _mpiImpl->rank() << "_container_log." 
<< filelogtemp - << " 2>&1 &\n"; - - // Wait other containers - tempOutputFile << " for ((ip=0; ip < "; - tempOutputFile << _mpiImpl->size(); - tempOutputFile << " ; ip++))" << endl; - tempOutputFile << " do" << endl ; - tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ; - tempOutputFile << " done" << endl ; - tempOutputFile << " sleep 5" << endl ; - tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ; - - // Launch user script - tempOutputFile << " ./runSession python ~/" << dirForTmpFiles << "/" << fileNameToExecute << ".py\n"; - - // Stop application - tempOutputFile << " rm " << filelogtemp << "\n" - << " ./runSession killSalomeWithPort.py $port\n"; - - // ------------------------------------- - // Other nodes launch a container - tempOutputFile << "else" << endl ; - - // Wait NamingService - tempOutputFile << " current=0\n" - << " stop=20\n" - << " while ! test -f " << filelogtemp << "\n" - << " do\n" - << " sleep 2\n" - << " let current=current+1\n" - << " if [ \"$current\" -eq \"$stop\" ] ; then\n" - << " echo Error Naming Service failed ! >&2" - << " exit\n" - << " fi\n" - << " done\n" - << " port=`cat " << filelogtemp << "`\n"; - - // Launching container - tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'"; - tempOutputFile << _mpiImpl->rank() - << " > ~/" << dirForTmpFiles << "/YACS_Server_" - << _mpiImpl->rank() << "_container_log." 
<< filelogtemp - << " 2>&1\n"; - tempOutputFile << "fi" << endl; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(TmpFileName.c_str(), 0x1ED); - SCRUTE(TmpFileName.c_str()) ; - - string command; - if( _params.protocol == "rsh" ) - command = "rcp "; - else if( _params.protocol == "ssh" ) - command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - - command += TmpFileName; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles ; - command += "/runSalome_" ; - command += fileNameToExecute ; - command += "_Batch.sh" ; - SCRUTE(fileNameToExecute) ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - RmTmpFile(TmpFileName); - - END_OF("BatchManager_PBS::buildSalomeCouplingScript"); - } - - void BatchManager_PBS::buildSalomeBatchScript(BatchLight::Job* job) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager_PBS::buildSalomeBatchScript"); - int status; - const int nbproc = job->getNbProc(); - std::string edt = job->getExpectedDuringTime(); - std::string mem = job->getMemory(); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - const char *fileToExecute = job->getFileToExecute(); - string::size_type p1 = string(fileToExecute).find_last_of("/"); - string::size_type p2 = string(fileToExecute).find_last_of("."); - std::string fileNameToExecute = string(fileToExecute).substr(p1+1,p2-p1-1); - int idx = dirForTmpFiles.find("Batch/"); - std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); - const std::string home = job->getHomeDir(); - - int nbmaxproc = _params.nbnodes * _params.nbprocpernode; - if( nbproc > nbmaxproc ){ - MESSAGE(nbproc << " processors asked on a cluster of " << nbmaxproc << " processors"); - throw SALOME_Exception("Too much processors asked for that cluster"); - } - - int nbnodes; - 
if( nbproc < _params.nbnodes ) - nbnodes = nbproc; - else - nbnodes = _params.nbnodes; - - std::string TmpFileName = BuildTemporaryFileName(); - ofstream tempOutputFile; - tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - - ostringstream filenameToExecute; - filenameToExecute << " ~/" << dirForTmpFiles << "/runSalome_" << fileNameToExecute << "_Batch.sh"; - - tempOutputFile << "#! /bin/sh -f" << endl ; - tempOutputFile << "#PBS -l nodes=" << nbnodes << endl ; - if (edt != "") - tempOutputFile << "#PBS -l walltime=" << edt << ":00" << endl ; - if (mem != "") - tempOutputFile << "#PBS -l mem=" << mem << endl ; - // In some systems qsub does not correctly expand env variables - // like PBS_O_HOME for #PBS directives.... - //tempOutputFile << "#PBS -o /$PBS_O_HOME/" << dirForTmpFiles << "/runSalome.output.log.${PBS_JOBID}" << endl ; - //tempOutputFile << "#PBS -e /$PBS_O_HOME/" << dirForTmpFiles << "/runSalome.error.log.${PBS_JOBID}" << endl ; - tempOutputFile << "#PBS -o " << home << "/" << dirForTmpFiles << "/runSalome.output.log." << filelogtemp << endl ; - tempOutputFile << "#PBS -e " << home << "/" << dirForTmpFiles << "/runSalome.error.log." 
<< filelogtemp << endl ; - tempOutputFile << _mpiImpl->boot("${PBS_NODEFILE}",nbnodes); - tempOutputFile << _mpiImpl->run("${PBS_NODEFILE}",nbproc,filenameToExecute.str()); - tempOutputFile << _mpiImpl->halt(); - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(TmpFileName.c_str(), 0x1ED); - SCRUTE(TmpFileName.c_str()) ; - - string command; - if( _params.protocol == "rsh" ) - command = "rcp "; - else if( _params.protocol == "ssh" ) - command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - command += TmpFileName; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles ; - command += "/" ; - command += fileNameToExecute ; - command += "_Batch.sh" ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - // Adding log files into import list files - ostringstream file_name_output; - file_name_output << "~/" << dirForTmpFiles << "/" << "runSalome.output.log*"; - ostringstream file_name_error; - file_name_error << "~/" << dirForTmpFiles << "/" << "runSalome.error.log*"; - ostringstream file_container_log; - file_container_log << "~/" << dirForTmpFiles << "/" << "YACS_Server*"; - job->addFileToImportList(file_name_output.str()); - job->addFileToImportList(file_name_error.str()); - job->addFileToImportList(file_container_log.str()); - RmTmpFile(TmpFileName); - END_OF("BatchManager_PBS::buildSalomeBatchScript"); - } - - int BatchManager_PBS::submit(BatchLight::Job* job) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager_PBS::submit"); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - const char *fileToExecute = job->getFileToExecute(); - string::size_type p1 = string(fileToExecute).find_last_of("/"); - string::size_type p2 = string(fileToExecute).find_last_of("."); - std::string fileNameToExecute = 
string(fileToExecute).substr(p1+1,p2-p1-1); - - // define name of log file - string logFile="/tmp/logs/"; - logFile += getenv("USER"); - logFile += "/batchSalome_"; - - srand ( time(NULL) ); - int ir = rand(); - ostringstream oss; - oss << ir; - logFile += oss.str(); - logFile += ".log"; - - string command; - int status; - - // define command to submit batch - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - - command += _params.hostname; - command += " \"cd " ; - command += dirForTmpFiles; - command += "; qsub " ; - command += fileNameToExecute ; - command += "_Batch.sh\" > "; - command += logFile; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - // read id of submitted job in log file - char line[128]; - FILE *fp = fopen(logFile.c_str(),"r"); - fgets( line, 128, fp); - fclose(fp); - - string sline(line); - int pos = sline.find("."); - string strjob; - if(pos == string::npos) - strjob = sline; - else - strjob = sline.substr(0,pos); - - int id; - istringstream iss(strjob); - iss >> id; - - // Ajout dans la map - _pbs_job_name[id] = sline; - END_OF("BatchManager_PBS::submit"); - return id; - } - -} diff --git a/src/Launcher/BatchLight_BatchManager_PBS.hxx b/src/Launcher/BatchLight_BatchManager_PBS.hxx deleted file mode 100644 index e0c21651b..000000000 --- a/src/Launcher/BatchLight_BatchManager_PBS.hxx +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, -// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License. 
-// -// This library is distributed in the hope that it will be useful -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -// -/* - * BatchManager.hxx : - * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME - * - */ - -#ifndef _BL_BATCHMANAGER_PBS_H_ -#define _BL_BATCHMANAGER_PBS_H_ - -#include -#include "Utils_SALOME_Exception.hxx" -#include "BatchLight_BatchManager.hxx" - -namespace BatchLight { - - class Job; - - class BatchManager_PBS : public BatchManager - { - public: - // Constructeur et destructeur - BatchManager_PBS(const batchParams& p) throw(SALOME_Exception); // connexion a la machine host - virtual ~BatchManager_PBS(); - - // Methodes pour le controle des jobs : virtuelles pures - void deleteJob(const int & jobid); // retire un job du gestionnaire - std::string queryJob(const int & jobid); // renvoie l'etat du job - - private: - void buildSalomeCouplingScript(BatchLight::Job* job) throw(SALOME_Exception); - void buildSalomeBatchScript(BatchLight::Job* job) throw(SALOME_Exception); - int submit(BatchLight::Job* job) throw(SALOME_Exception); - - // Permet d'avoir la chaîne complête pour demander - // le statut du job - typedef std::map _pbs_job_name_t; - _pbs_job_name_t _pbs_job_name; - }; - -} - -#endif diff --git a/src/Launcher/BatchLight_BatchManager_SLURM.cxx b/src/Launcher/BatchLight_BatchManager_SLURM.cxx deleted file mode 100644 index d184ca6bb..000000000 --- a/src/Launcher/BatchLight_BatchManager_SLURM.cxx +++ /dev/null @@ -1,340 +0,0 @@ -// Copyright (C) 2005 OPEN CASCADE, 
EADS/CCR, LIP6, CEA/DEN, -// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License. -// -// This library is distributed in the hope that it will be useful -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -// -/* - * BatchManager.cxx : - * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME - * - */ - -#include "BatchLight_BatchManager_SLURM.hxx" -#include "utilities.h" -#include "BatchLight_Job.hxx" -#include -#include -#include -#include - -using namespace std; - -namespace BatchLight { - - // Constructeur - BatchManager_SLURM::BatchManager_SLURM(const batchParams& p) throw(SALOME_Exception) : BatchManager(p) - { - } - - // Destructeur - BatchManager_SLURM::~BatchManager_SLURM() - { - MESSAGE("BatchManager_SLURM destructor "<<_params.hostname); - } - - // Methode pour le controle des jobs : retire un job du gestionnaire - void BatchManager_SLURM::deleteJob(const int & jobid) - { - BEGIN_OF("BatchManager_SLURM::deleteJob"); - string command; - int status; - ostringstream oss; - oss << jobid; - - // define command to submit batch - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - - if (_params.username != ""){ - command += _params.username; - command += "@"; - 
} - - command += _params.hostname; - command += " \"bkill " ; - command += oss.str(); - command += "\""; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - MESSAGE("jobId = " << jobid << "killed"); - END_OF("BatchManager_SLURM::deleteJob"); - } - - // Methode pour le controle des jobs : renvoie l'etat du job - string BatchManager_SLURM::queryJob(const int & jobid) - { - BEGIN_OF("BatchManager_SLURM::queryJob"); - // define name of log file - string logFile="/tmp/logs/"; - logFile += getenv("USER"); - logFile += "/batchSalome_"; - - srand ( time(NULL) ); - int ir = rand(); - ostringstream oss; - oss << ir; - logFile += oss.str(); - logFile += ".log"; - - string command; - int status; - - // define command to submit batch - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - - command += _params.hostname; - command += " \"bjobs " ; - ostringstream oss2; - oss2 << jobid; - command += oss2.str(); - command += "\" > "; - command += logFile; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - // read staus of job in log file - char line[128]; - ifstream fp(logFile.c_str(),ios::in); - fp.getline(line,80,'\n'); - - string sjobid, username, jstatus; - fp >> sjobid; - fp >> username; - fp >> jstatus; - - MESSAGE("jobId = " << jobid << " " << jstatus); - END_OF("BatchManager_SLURM::queryJob"); - return jstatus; - } - - void BatchManager_SLURM::buildSalomeCouplingScript(BatchLight::Job* job) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager_SLURM::buildSalomeCouplingScript"); - int status; - const char *fileToExecute = job->getFileToExecute(); - const std::string dirForTmpFiles = 
job->getDirForTmpFiles(); - - string::size_type p1 = string(fileToExecute).find_last_of("/"); - string::size_type p2 = string(fileToExecute).find_last_of("."); - std::string fileNameToExecute = string(fileToExecute).substr(p1+1,p2-p1-1); - - std::string TmpFileName = BuildTemporaryFileName(); - ofstream tempOutputFile; - tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - tempOutputFile << "#! /bin/sh -f" << endl ; - tempOutputFile << "cd " ; - tempOutputFile << _params.applipath << endl ; - tempOutputFile << "export PYTHONPATH=~/" ; - tempOutputFile << dirForTmpFiles ; - tempOutputFile << ":$PYTHONPATH" << endl ; - tempOutputFile << "if test $SLURM_PROCID = 0; then" << endl ; - tempOutputFile << " ./runAppli --terminal --modules=" ; - for ( int i = 0 ; i < _params.modulesList.size() ; i++ ) { - tempOutputFile << _params.modulesList[i] ; - if ( i != _params.modulesList.size()-1 ) - tempOutputFile << "," ; - } - tempOutputFile << " --standalone=registry,study,moduleCatalog --killall &" << endl ; - tempOutputFile << " for ((ip=1; ip < ${SLURM_NPROCS} ; ip++))" << endl; - tempOutputFile << " do" << endl ; - tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ; - tempOutputFile << " done" << endl ; - tempOutputFile << " ./runSession waitNS.sh" << endl ; - tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ; - tempOutputFile << " ./runSession python ~/" << dirForTmpFiles << "/" << fileNameToExecute << ".py" << endl; - tempOutputFile << " ./runSession killCurrentPort" << endl; - tempOutputFile << "else" << endl ; - tempOutputFile << " ./runSession waitNS.sh" << endl ; - tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'${SLURM_PROCID}" << endl ; - tempOutputFile << "fi" << endl ; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(TmpFileName.c_str(), 0x1ED); - SCRUTE(TmpFileName.c_str()) ; - - string command; - if( _params.protocol == "rsh" ) - command = "rcp "; - else if( _params.protocol == "ssh" ) - 
command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - - command += TmpFileName; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles ; - command += "/runSalome_" ; - command += fileNameToExecute ; - command += "_Batch.sh" ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - RmTmpFile(TmpFileName); - - END_OF("BatchManager_SLURM::buildSalomeCouplingScript"); - } - - void BatchManager_SLURM::buildSalomeBatchScript(BatchLight::Job* job) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager_SLURM::buildSalomeBatchScript"); - int status; - const int nbproc = job->getNbProc(); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - std::string TmpFileName = BuildTemporaryFileName(); - ofstream tempOutputFile; - tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); - const char *fileToExecute = job->getFileToExecute(); - string::size_type p1 = string(fileToExecute).find_last_of("/"); - string::size_type p2 = string(fileToExecute).find_last_of("."); - std::string fileNameToExecute = string(fileToExecute).substr(p1+1,p2-p1-1); - - tempOutputFile << "#! 
/bin/sh -f" << endl ; - tempOutputFile << "#BSUB -n " << nbproc << endl ; - tempOutputFile << "#BSUB -o " << dirForTmpFiles << "/runSalome.log%J" << endl ; - tempOutputFile << "srun ~/" << dirForTmpFiles << "/runSalome_" << fileNameToExecute << "_Batch.sh" << endl ; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(TmpFileName.c_str(), 0x1ED); - SCRUTE(TmpFileName.c_str()) ; - - string command; - if( _params.protocol == "rsh" ) - command = "rcp "; - else if( _params.protocol == "ssh" ) - command = "scp "; - else - throw SALOME_Exception("Unknown protocol"); - command += TmpFileName; - command += " "; - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - command += _params.hostname; - command += ":"; - command += dirForTmpFiles ; - command += "/" ; - command += fileNameToExecute ; - command += "_Batch.sh" ; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - RmTmpFile(TmpFileName); - END_OF("BatchManager_SLURM::buildSalomeBatchScript"); - - } - - int BatchManager_SLURM::submit(BatchLight::Job* job) throw(SALOME_Exception) - { - BEGIN_OF("BatchManager_SLURM::submit"); - const std::string dirForTmpFiles = job->getDirForTmpFiles(); - const char *fileToExecute = job->getFileToExecute(); - string::size_type p1 = string(fileToExecute).find_last_of("/"); - string::size_type p2 = string(fileToExecute).find_last_of("."); - std::string fileNameToExecute = string(fileToExecute).substr(p1+1,p2-p1-1); - - // define name of log file - string logFile="/tmp/logs/"; - logFile += getenv("USER"); - logFile += "/batchSalome_"; - - srand ( time(NULL) ); - int ir = rand(); - ostringstream oss; - oss << ir; - logFile += oss.str(); - logFile += ".log"; - - string command; - int status; - - // define command to submit batch - if( _params.protocol == "rsh" ) - command = "rsh "; - else if( _params.protocol == "ssh" ) - command = "ssh "; - else - throw 
SALOME_Exception("Unknown protocol"); - - if (_params.username != ""){ - command += _params.username; - command += "@"; - } - - command += _params.hostname; - command += " \"bsub < " ; - command += dirForTmpFiles ; - command += "/" ; - command += fileNameToExecute ; - command += "_Batch.sh\" > "; - command += logFile; - SCRUTE(command.c_str()); - status = system(command.c_str()); - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - // read id of submitted job in log file - char line[128]; - FILE *fp = fopen(logFile.c_str(),"r"); - fgets( line, 128, fp); - fclose(fp); - - string sline(line); - int p10 = sline.find("<"); - int p20 = sline.find(">"); - string strjob = sline.substr(p10+1,p20-p10-1); - - int id; - istringstream iss(strjob); - iss >> id; - - END_OF("BatchManager_SLURM::submit"); - return id; - } - -} diff --git a/src/Launcher/BatchLight_Job.cxx b/src/Launcher/BatchLight_Job.cxx deleted file mode 100644 index ac818c4c7..000000000 --- a/src/Launcher/BatchLight_Job.cxx +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, -// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License. -// -// This library is distributed in the hope that it will be useful -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
-// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -// -/* - * Job.cxx : - * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME - * - */ - -#include "BatchLight_Job.hxx" -#include - -using namespace std; -using namespace BatchLight; - -Job::Job(const char *fileToExecute, - const Engines::FilesList& filesToExport, - const Engines::FilesList& filesToImport, - const Engines::BatchParameters& batch_params) : _fileToExecute(fileToExecute), - _filesToExport(filesToExport), - _filesToImport(filesToImport), - _batch_params(batch_params) -{ - _dirForTmpFiles = "/tmp/default_batch_tmp_directory"; - _home=""; -} - -Job::~Job() -{ - MESSAGE("Job destructor"); -} - -void -Job::addFileToImportList(std::string file_name) -{ - CORBA::ULong lgth = _filesToImport.length(); - _filesToImport.length(lgth+1); - _filesToImport[lgth] = CORBA::string_dup(file_name.c_str()); -} - -const std::string -Job::getExpectedDuringTime() -{ - std::string str(_batch_params.expected_during_time); - return str; -} - -const std::string -Job::getMemory() -{ - std::string str(_batch_params.mem); - return str; -} - -bool -Job::check() { - bool rtn = true; - INFOS("Warning : batch_directory option is not currently implemented"); - INFOS("Warning : currently these informations are only in the PBS batch manager"); - INFOS("Job parameters are :"); - INFOS("Directory : $HOME/Batch/$date"); - - // check expected_during_time (check the format) - std::string edt_info; - std::string edt_value = _batch_params.expected_during_time.in(); - if (edt_value != "") { - std::string begin_edt_value = edt_value.substr(0, 2); - std::string mid_edt_value = edt_value.substr(2, 1); - std::string end_edt_value = edt_value.substr(3); - - long value; 
- std::istringstream iss(begin_edt_value); - if (!(iss >> value)) { - edt_info = "Error on definition ! : " + edt_value; - rtn = false; - } - else if (value < 0) { - edt_info = "Error on definition time is negative ! : " + value; - rtn = false; - } - std::istringstream iss_2(end_edt_value); - if (!(iss_2 >> value)) { - edt_info = "Error on definition ! : " + edt_value; - rtn = false; - } - else if (value < 0) { - edt_info = "Error on definition time is negative ! : " + value; - rtn = false; - } - if (mid_edt_value != ":") { - edt_info = "Error on definition ! :" + edt_value; - rtn = false; - } - } - else { - edt_info = "No value given"; - } - INFOS("Expected during time : " << edt_info); - - // check memory (check the format) - std::string mem_info; - std::string mem_value = _batch_params.mem.in(); - if (mem_value != "") { - std::string begin_mem_value = mem_value.substr(0, mem_value.length()-2); - long re_mem_value; - std::istringstream iss(begin_mem_value); - if (!(iss >> re_mem_value)) { - mem_info = "Error on definition ! : " + mem_value; - rtn = false; - } - else if (re_mem_value <= 0) { - mem_info = "Error on definition memory is negative ! : " + mem_value; - rtn = false; - } - std::string end_mem_value = mem_value.substr(mem_value.length()-2); - if (end_mem_value != "gb" and end_mem_value != "mb") { - mem_info = "Error on definition, type is bad ! " + mem_value; - rtn = false; - } - } - else { - mem_info = "No value given"; - } - INFOS("Memory : " << mem_info); - - // check nb_proc - std::string nb_proc_info; - ostringstream nb_proc_value; - nb_proc_value << _batch_params.nb_proc; - if(_batch_params.nb_proc <= 0) { - nb_proc_info = "Bad value ! 
nb_proc = "; - nb_proc_info += nb_proc_value.str(); - rtn = false; - } - else { - nb_proc_info = nb_proc_value.str(); - } - INFOS("Nb of processors : " << nb_proc_info); - - return rtn; -} diff --git a/src/Launcher/BatchLight_Job.hxx b/src/Launcher/BatchLight_Job.hxx deleted file mode 100644 index 94cdd6a45..000000000 --- a/src/Launcher/BatchLight_Job.hxx +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, -// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License. -// -// This library is distributed in the hope that it will be useful -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -// -/* - * Job.hxx : - * - * Auteur : Bernard SECHER - CEA/DEN - * Date : Juillet 2007 - * Projet : SALOME - * - */ - -#ifndef _BL_JOB_H_ -#define _BL_JOB_H_ - -#include "utilities.h" -#include -#include CORBA_CLIENT_HEADER(SALOME_ContainerManager) - -namespace BatchLight { - - class Job - { - public: - // Constructeurs et destructeur - Job(const char *fileToExecute, - const Engines::FilesList& filesToExport, - const Engines::FilesList& filesToImport, - const Engines::BatchParameters& batch_params); - virtual ~Job(); - - const char *getFileToExecute() const { return _fileToExecute; } - const Engines::FilesList getFilesToExportList() const { return _filesToExport; } - const 
Engines::FilesList getFilesToImportList() const { return _filesToImport; } - void addFileToImportList(std::string file_name); - const CORBA::Long getNbProc() const { return _batch_params.nb_proc; } - const std::string getExpectedDuringTime(); - const std::string getMemory(); - - const std::string getDirForTmpFiles() const { return _dirForTmpFiles;} - void setDirForTmpFiles(std::string dirForTmpFiles) {_dirForTmpFiles = dirForTmpFiles; - SCRUTE(_dirForTmpFiles);} - void setHomeDir(std::string home) {_home = home;SCRUTE(_dirForTmpFiles);} - const std::string getHomeDir() {return _home;} - bool check(); - protected: - const char* _fileToExecute; - const Engines::FilesList _filesToExport; - Engines::FilesList _filesToImport; - Engines::BatchParameters _batch_params; - std::string _dirForTmpFiles; // Tmp directory on the server - std::string _home; // Home directory on the server - private: - - }; - -} - -#endif diff --git a/src/Launcher/Launcher.cxx b/src/Launcher/Launcher.cxx new file mode 100644 index 000000000..5caf6a4ce --- /dev/null +++ b/src/Launcher/Launcher.cxx @@ -0,0 +1,625 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +#include "Batch_Date.hxx" +#include "Batch_FactBatchManager_eLSF.hxx" +#include "Batch_FactBatchManager_ePBS.hxx" +#include "Launcher.hxx" +#include +#include +#include + +using namespace std; + +//============================================================================= +/*! + * Constructor + * \param orb + * Define a CORBA single thread policy for the server, which avoid to deal + * with non thread-safe usage like Change_Directory in SALOME naming service + */ +//============================================================================= + +Launcher_cpp::Launcher_cpp() +{ + cerr << "Launcher_cpp constructor" << endl; +} + +//============================================================================= +/*! + * destructor + */ +//============================================================================= + +Launcher_cpp::~Launcher_cpp() +{ + cerr << "Launcher_cpp destructor" << endl; + std::map < string, Batch::BatchManager_eClient * >::const_iterator it1; + for(it1=_batchmap.begin();it1!=_batchmap.end();it1++) + delete it1->second; + std::map < std::pair , Batch::Job* >::const_iterator it2; + for(it2=_jobmap.begin();it2!=_jobmap.end();it2++) + delete it2->second; +} + +//============================================================================= +/*! CORBA Method: + * Submit a batch job on a cluster and returns the JobId + * \param fileToExecute : .py/.exe/.sh/... 
to execute on the batch cluster + * \param filesToExport : to export on the batch cluster + * \param NumberOfProcessors : Number of processors needed on the batch cluster + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +long Launcher_cpp::submitSalomeJob( const string fileToExecute , + const vector& filesToExport , + const vector& filesToImport , + const batchParams& batch_params, + const machineParams& params) throw(LauncherException) +{ + cerr << "BEGIN OF Launcher_cpp::submitSalomeJob" << endl; + long jobId; + vector aMachineList; + + // check batch params + if ( !check(batch_params) ) + throw LauncherException("Batch parameters are bad (see informations above)"); + + // find a cluster matching the structure params + vector aCompoList ; + try{ + aMachineList = _ResManager->GetFittingResources(params, aCompoList); + } + catch(const ResourcesException &ex){ + throw LauncherException(ex.msg.c_str()); + } + if (aMachineList.size() == 0) + throw LauncherException("No resources have been found with your parameters"); + + ParserResourcesType p = _ResManager->GetResourcesList(aMachineList[0]); + string clustername(p.Alias); + cerr << "Choose cluster: " << clustername << endl; + + // search batch manager for that cluster in map or instanciate one + map < string, Batch::BatchManager_eClient * >::const_iterator it = _batchmap.find(clustername); + if(it == _batchmap.end()) + { + _batchmap[clustername] = FactoryBatchManager(p); + // TODO: Add a test for the cluster ! 
+ } + + try{ + // tmp directory on cluster to put files to execute + string tmpdir = getTmpDirForBatchFiles(); + + // create and submit job on cluster + Batch::Parametre param; + param[USER] = p.UserName; + param[EXECUTABLE] = buildSalomeCouplingScript(fileToExecute,tmpdir,p); + param[INFILE] = Batch::Couple( fileToExecute, getRemoteFile(tmpdir,fileToExecute) ); + for(int i=0;isubmitJob(*job); + + // get job id in long + istringstream iss(jid.getReference()); + iss >> jobId; + + _jobmap[ pair(clustername,jobId) ] = job; + } + catch(const Batch::EmulationException &ex){ + throw LauncherException(ex.msg.c_str()); + } + + return jobId; +} + +//============================================================================= +/*! CORBA Method: + * Query a batch job on a cluster and returns the status of job + * \param jobId : identification of Salome job + * \param params : Constraints for the choice of the batch cluster + */ +//============================================================================= +string Launcher_cpp::querySalomeJob( long id, + const machineParams& params) throw(LauncherException) +{ + // find a cluster matching params structure + vector aCompoList ; + vector aMachineList = _ResManager->GetFittingResources( params , aCompoList ) ; + ParserResourcesType p = _ResManager->GetResourcesList(aMachineList[0]); + string clustername(p.Alias); + + // search batch manager for that cluster in map + std::map < string, Batch::BatchManager_eClient * >::const_iterator it = _batchmap.find(clustername); + if(it == _batchmap.end()) + throw LauncherException("no batchmanager for that cluster"); + + ostringstream oss; + oss << id; + Batch::JobId jobId( _batchmap[clustername], oss.str() ); + + Batch::JobInfo jinfo = jobId.queryJob(); + Batch::Parametre par = jinfo.getParametre(); + return par[STATE]; +} + +//============================================================================= +/*! 
CORBA Method:
+ *  Delete a batch job on a cluster
+ *  \param id     : identification of the Salome job (the value returned by submitSalomeJob)
+ *  \param params : Constraints for the choice of the batch cluster
+ *  \throw LauncherException if no resource fits params, or if no batch manager
+ *         has been instantiated yet for the selected cluster
+ */
+//=============================================================================
+void Launcher_cpp::deleteSalomeJob( const long id, 
+                                    const machineParams& params) throw(LauncherException)
+{
+  // find a cluster matching params structure
+  // A ResourcesException is not part of the exception specification of this
+  // method, so it is translated exactly as submitSalomeJob does.
+  vector<string> aCompoList ;
+  vector<string> aMachineList ;
+  try{
+    aMachineList = _ResManager->GetFittingResources( params , aCompoList ) ;
+  }
+  catch(const ResourcesException &ex){
+    throw LauncherException(ex.msg.c_str());
+  }
+  // indexing an empty list below would be undefined behaviour
+  if (aMachineList.size() == 0)
+    throw LauncherException("No resources have been found with your parameters");
+
+  ParserResourcesType p = _ResManager->GetResourcesList(aMachineList[0]);
+  string clustername(p.Alias);
+    
+  // search batch manager for that cluster in map
+  map < string, Batch::BatchManager_eClient * >::const_iterator it = _batchmap.find(clustername);
+  if(it == _batchmap.end())
+    throw LauncherException("no batchmanager for that cluster");
+  
+  ostringstream oss;
+  oss << id;
+  // reuse the iterator instead of a second lookup through operator[]
+  Batch::JobId jobId( it->second, oss.str() );
+
+  jobId.deleteJob();
+}
+
+//=============================================================================
+/*!
CORBA Method:
+ *  Get result files of job on a cluster
+ *  \param directory : directory handed to the batch manager's importOutputFiles
+ *  \param id        : identification of Salome job
+ *  \param params    : Constraints for the choice of the batch cluster
+ *  \throw LauncherException if no resource fits params, if no batch manager
+ *         exists for the selected cluster, or if the job is unknown
+ */
+//=============================================================================
+void Launcher_cpp::getResultSalomeJob( const string directory,
+                                       const long id, 
+                                       const machineParams& params) throw(LauncherException)
+{
+  // find a cluster matching params structure
+  // A ResourcesException is not part of the exception specification of this
+  // method, so it is translated exactly as submitSalomeJob does.
+  vector<string> aCompoList ;
+  vector<string> aMachineList ;
+  try{
+    aMachineList = _ResManager->GetFittingResources( params , aCompoList ) ;
+  }
+  catch(const ResourcesException &ex){
+    throw LauncherException(ex.msg.c_str());
+  }
+  // indexing an empty list below would be undefined behaviour
+  if (aMachineList.size() == 0)
+    throw LauncherException("No resources have been found with your parameters");
+
+  ParserResourcesType p = _ResManager->GetResourcesList(aMachineList[0]);
+  string clustername(p.Alias);
+    
+  // search batch manager for that cluster in map
+  map < string, Batch::BatchManager_eClient * >::const_iterator it = _batchmap.find(clustername);
+  if(it == _batchmap.end())
+    throw LauncherException("no batchmanager for that cluster");
+
+  // search the job in map: operator[] alone would insert a null entry for an
+  // unknown (cluster, id) pair, and that null pointer would then be dereferenced
+  if( _jobmap.count( std::make_pair(clustername, id) ) == 0 )
+    throw LauncherException("no job with that id for that cluster");
+  Batch::Job* job = _jobmap[ std::make_pair(clustername, id) ];
+  if( job == 0 )
+    throw LauncherException("no job with that id for that cluster");
+
+  it->second->importOutputFiles( *job, directory );
+}
+
+//=============================================================================
+/*!
+ * Factory to instantiate the right batch manager for the chosen cluster.
+ */ +//============================================================================= + +Batch::BatchManager_eClient *Launcher_cpp::FactoryBatchManager( const ParserResourcesType& params ) throw(LauncherException) +{ + + std::string hostname, protocol, mpi; + Batch::FactBatchManager_eClient* fact; + + hostname = params.Alias; + switch(params.Protocol){ + case rsh: + protocol = "rsh"; + break; + case ssh: + protocol = "ssh"; + break; + default: + throw LauncherException("unknown protocol"); + break; + } + switch(params.mpi){ + case lam: + mpi = "lam"; + break; + case mpich1: + mpi = "mpich1"; + break; + case mpich2: + mpi = "mpich2"; + break; + case openmpi: + mpi = "openmpi"; + break; + case slurm: + mpi = "slurm"; + break; + default: + mpi = "indif"; + break; + } + cerr << "Instanciation of batch manager" << endl; + switch( params.Batch ){ + case pbs: + cerr << "Instantiation of PBS batch manager" << endl; + fact = new Batch::FactBatchManager_ePBS; + break; + case lsf: + cerr << "Instantiation of LSF batch manager" << endl; + fact = new Batch::FactBatchManager_eLSF; + break; + default: + cerr << "BATCH = " << params.Batch << endl; + throw LauncherException("no batchmanager for that cluster"); + } + return (*fact)(hostname.c_str(),protocol.c_str(),mpi.c_str()); +} + +string Launcher_cpp::buildSalomeCouplingScript(const string fileToExecute, const string dirForTmpFiles, const ParserResourcesType& params) +{ + int idx = dirForTmpFiles.find("Batch/"); + std::string filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); + + string::size_type p1 = fileToExecute.find_last_of("/"); + string::size_type p2 = fileToExecute.find_last_of("."); + std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + std::string TmpFileName = "/tmp/runSalome_" + fileNameToExecute + ".sh"; + + MpiImpl* mpiImpl = FactoryMpiImpl(params.mpi); + + ofstream tempOutputFile; + tempOutputFile.open(TmpFileName.c_str(), ofstream::out ); + + // Begin + tempOutputFile << "#! 
/bin/sh -f" << endl ;
+  tempOutputFile << "cd " ;
+  tempOutputFile << params.AppliPath << endl ;
+  tempOutputFile << "export SALOME_BATCH=1\n";
+  tempOutputFile << "export PYTHONPATH=~/" ;
+  tempOutputFile << dirForTmpFiles ;
+  tempOutputFile << ":$PYTHONPATH" << endl ;
+
+  // Test node rank
+  tempOutputFile << "if test " ;
+  tempOutputFile << mpiImpl->rank() ;
+  tempOutputFile << " = 0; then" << endl ;
+
+  // -----------------------------------------------
+  // Code for rank 0 : launch runAppli and a container
+  // RunAppli
+  tempOutputFile << " ./runAppli --terminal --modules=" ;
+  for ( int i = 0 ; i < params.ModulesList.size() ; i++ ) {
+    tempOutputFile << params.ModulesList[i] ;
+    if ( i != params.ModulesList.size()-1 )
+      tempOutputFile << "," ;
+  }
+  tempOutputFile << " --standalone=registry,study,moduleCatalog --ns-port-log="
+		 << filelogtemp 
+		 << " &\n";
+
+  // Wait NamingService
+  // The generated loop polls for the port log file every 2 seconds, at most
+  // 20 times. The "\n" after ">&2" matters: without it the shell reads "exit"
+  // as one more argument of echo and the script never leaves the loop.
+  tempOutputFile << " current=0\n"
+		 << " stop=20\n" 
+		 << " while ! test -f " << filelogtemp << "\n"
+		 << " do\n"
+		 << " sleep 2\n"
+		 << " let current=current+1\n"
+		 << " if [ \"$current\" -eq \"$stop\" ] ; then\n"
+		 << " echo Error Naming Service failed ! >&2\n"
+		 << " exit\n"
+		 << " fi\n"
+		 << " done\n"
+		 << " port=`cat " << filelogtemp << "`\n";
+    
+  // Wait other containers
+  tempOutputFile << " for ((ip=1; ip < ";
+  tempOutputFile << mpiImpl->size();
+  tempOutputFile << " ; ip++))" << endl;
+  tempOutputFile << " do" << endl ;
+  tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ;
+  tempOutputFile << " done" << endl ;
+  tempOutputFile << " sleep 5" << endl ;
+  tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ;
+  
+  // Launch user script
+  tempOutputFile << " ./runSession python ~/" << dirForTmpFiles << "/" << fileNameToExecute << ".py" << endl;
+
+  // Stop application
+  tempOutputFile << " rm " << filelogtemp << "\n"
+		 << " ./runSession shutdownSalome.py" << endl;
+
+  // -------------------------------------
+  // Other nodes launch a container
+  tempOutputFile << "else" << endl ;
+
+  // Wait NamingService
+  // Same polling loop as for rank 0, with the same "\n" before "exit".
+  tempOutputFile << " current=0\n"
+		 << " stop=20\n" 
+		 << " while ! test -f " << filelogtemp << "\n"
+		 << " do\n"
+		 << " sleep 2\n"
+		 << " let current=current+1\n"
+		 << " if [ \"$current\" -eq \"$stop\" ] ; then\n"
+		 << " echo Error Naming Service failed ! >&2\n"
+		 << " exit\n"
+		 << " fi\n"
+		 << " done\n"
+		 << " port=`cat " << filelogtemp << "`\n";
+
+  // Launching container
+  tempOutputFile << " ./runSession SALOME_Container YACS_Server_";
+  tempOutputFile << mpiImpl->rank()
+		 << " > ~/" << dirForTmpFiles << "/YACS_Server_" 
+		 << mpiImpl->rank() << "_container_log."
<< filelogtemp + << " 2>&1\n"; + tempOutputFile << "fi" << endl ; + tempOutputFile.flush(); + tempOutputFile.close(); + chmod(TmpFileName.c_str(), 0x1ED); + cerr << TmpFileName.c_str() << endl; + + delete mpiImpl; + + return TmpFileName; + +} + +MpiImpl *Launcher_cpp::FactoryMpiImpl(MpiImplType mpi) throw(LauncherException) +{ + switch(mpi){ + case lam: + return new MpiImpl_LAM(); + case mpich1: + return new MpiImpl_MPICH1(); + case mpich2: + return new MpiImpl_MPICH2(); + case openmpi: + return new MpiImpl_OPENMPI(); + case slurm: + return new MpiImpl_SLURM(); + case indif: + throw LauncherException("you must specify a mpi implementation in CatalogResources.xml file"); + default: + ostringstream oss; + oss << mpi << " : not yet implemented"; + throw LauncherException(oss.str().c_str()); + } + +} + +string Launcher_cpp::getTmpDirForBatchFiles() +{ + string ret; + string thedate; + + // Adding date to the directory name + Batch::Date date = Batch::Date(time(0)); + thedate = date.str(); + int lend = thedate.size() ; + int i = 0 ; + while ( i < lend ) { + if ( thedate[i] == '/' || thedate[i] == '-' || thedate[i] == ':' ) { + thedate[i] = '_' ; + } + i++ ; + } + + ret = string("Batch/"); + ret += thedate; + return ret; +} + +string Launcher_cpp::getRemoteFile( std::string remoteDir, std::string localFile ) +{ + string::size_type pos = localFile.find_last_of("/") + 1; + int ln = localFile.length() - pos; + string remoteFile = remoteDir + "/" + localFile.substr(pos,ln); + return remoteFile; +} + +bool Launcher_cpp::check(const batchParams& batch_params) +{ + bool rtn = true; + cerr << "Job parameters are :" << endl; + cerr << "Directory : $HOME/Batch/$date" << endl; + + // check expected_during_time (check the format) + std::string edt_info = batch_params.expected_during_time; + std::string edt_value = batch_params.expected_during_time; + if (edt_value != "") { + std::string begin_edt_value = edt_value.substr(0, 2); + std::string mid_edt_value = edt_value.substr(2, 1); + 
std::string end_edt_value = edt_value.size() > 3 ? edt_value.substr(3) : std::string(); // substr(3) throws std::out_of_range on shorter input
+
+    long value;
+    std::istringstream iss(begin_edt_value);
+    if (!(iss >> value)) {
+      edt_info = "Error on definition ! : " + edt_value;
+      rtn = false;
+    }
+    else if (value < 0) {
+      // append the text that was given: adding the long itself to the
+      // literal is pointer arithmetic, not a concatenation
+      edt_info = "Error on definition time is negative ! : " + edt_value;
+      rtn = false;
+    }
+    std::istringstream iss_2(end_edt_value);
+    if (!(iss_2 >> value)) {
+      edt_info = "Error on definition ! : " + edt_value;
+      rtn = false;
+    }
+    else if (value < 0) {
+      edt_info = "Error on definition time is negative ! : " + edt_value;
+      rtn = false;
+    }
+    if (mid_edt_value != ":") {
+      edt_info = "Error on definition ! :" + edt_value;
+      rtn = false;
+    }
+  }
+  else {
+    edt_info = "No value given";
+  }
+  cerr << "Expected during time : " << edt_info << endl;
+
+  // check memory (check the format)
+  std::string mem_value = batch_params.mem;
+  std::string mem_info = mem_value; // printed as given when the value is valid
+  if (mem_value != "") {
+    if (mem_value.length() < 2) {
+      // too short to carry a two-letter unit: substr(length()-2) would throw
+      mem_info = "Error on definition ! : " + mem_value;
+      rtn = false;
+    }
+    else {
+      std::string begin_mem_value = mem_value.substr(0, mem_value.length()-2);
+      long re_mem_value;
+      std::istringstream iss(begin_mem_value);
+      if (!(iss >> re_mem_value)) {
+	mem_info = "Error on definition ! : " + mem_value;
+	rtn = false;
+      }
+      else if (re_mem_value <= 0) {
+	mem_info = "Error on definition memory is negative ! : " + mem_value;
+	rtn = false;
+      }
+      std::string end_mem_value = mem_value.substr(mem_value.length()-2);
+      if (end_mem_value != "gb" && end_mem_value != "mb") {
+	mem_info = "Error on definition, type is bad ! " + mem_value;
+	rtn = false;
+      }
+    }
+  }
+  else {
+    mem_info = "No value given";
+  }
+  cerr << "Memory : " << mem_info << endl;
+
+  // check nb_proc
+  std::string nb_proc_info;
+  ostringstream nb_proc_value;
+  nb_proc_value << batch_params.nb_proc;
+  if(batch_params.nb_proc <= 0) {
+    nb_proc_info = "Bad value ! nb_proc = ";
+    nb_proc_info += nb_proc_value.str();
+    rtn = false;
+  }
+  else {
+    nb_proc_info = nb_proc_value.str();
+  }
+  cerr << "Nb of processors : " << nb_proc_info << endl;
+
+  return rtn;
+}
+
+// Convert a duration written "hh:mm" into minutes.
+// An empty string gives 0. A component that cannot be parsed counts as 0, and
+// a string without ':' is read as hours only.
+long Launcher_cpp::getWallTime(std::string edt)
+{
+  long hh = 0, mm = 0;
+
+  if( edt.size() == 0 )
+    return 0;
+
+  string::size_type pos = edt.find(":");
+  istringstream issh(edt.substr(0,pos));
+  if( !(issh >> hh) )
+    hh = 0;
+  if( pos != string::npos ) {
+    istringstream issm(edt.substr(pos+1));
+    if( !(issm >> mm) )
+      mm = 0;
+  }
+  return hh*60 + mm;
+}
+
+// Convert a memory size followed by a unit (gb/GB, mb/MB, kb/KB, b/B) into
+// megabytes, using integer arithmetic.
+// Returns 0 when the string is empty, too short, has no known unit or no
+// leading number.
+long Launcher_cpp::getRamSize(std::string mem)
+{
+  // at least one digit and a one-letter unit are needed
+  if( mem.size() < 2 )
+    return 0;
+
+  long factor = 1, divisor = 1;
+  string::size_type unitLength = 2;
+  const string unity = mem.substr(mem.size()-2,2);
+  const char last = mem[mem.size()-1];
+  if( unity == "gb" || unity == "GB" )
+    factor = 1024;
+  else if( unity == "mb" || unity == "MB" )
+    factor = 1;
+  else if( unity == "kb" || unity == "KB" )
+    divisor = 1024;
+  else if( last == 'b' || last == 'B' ) {
+    // the byte unit is a single letter: only strip one character so that
+    // the last digit of the number is not lost
+    unitLength = 1;
+    divisor = 1024*1024;
+  }
+  else
+    return 0;
+
+  long mv = 0;
+  istringstream iss(mem.substr(0,mem.size()-unitLength));
+  if( !(iss >> mv) )
+    return 0;
+  return mv*factor/divisor;
+}
+
+std::string
+Launcher_cpp::getHomeDir(const ParserResourcesType& p, const std::string& tmpdir)
+{
+  std::string home;
+  std::string command;
+  int idx = tmpdir.find("Batch/");
+  std::string filelogtemp = tmpdir.substr(idx+6, tmpdir.length());
+  filelogtemp = "/tmp/logs" + filelogtemp + "_home";
+
+  if( p.Protocol == rsh )
+    command = "rsh ";
+  else if( p.Protocol == ssh )
+    command = "ssh ";
+  else
+    throw LauncherException("Unknown protocol");
+  if (p.UserName != ""){
+    command += p.UserName;
+    command += "@";
+  }
+  command += p.Alias;
+  command += " 'echo $HOME' > ";
+  command += filelogtemp;
+  std::cerr << command.c_str() << std::endl;
+  int status = system(command.c_str());
+  if(status)
+    throw LauncherException("Error of launching home command on remote host");
+
+
std::ifstream file_home(filelogtemp.c_str()); + std::getline(file_home, home); + file_home.close(); + return home; +} diff --git a/src/Launcher/Launcher.hxx b/src/Launcher/Launcher.hxx new file mode 100644 index 000000000..84dd59dd5 --- /dev/null +++ b/src/Launcher/Launcher.hxx @@ -0,0 +1,79 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +#ifndef __LAUNCHER_HXX__ +#define __LAUNCHER_HXX__ + +#include "Batch_BatchManager_eClient.hxx" +#include "ResourcesManager.hxx" + +#include + +struct batchParams{ + std::string batch_directory; + std::string expected_during_time; + std::string mem; + unsigned long nb_proc; +}; + +class LauncherException +{ +public: + const std::string msg; + + LauncherException(const std::string m) : msg(m) {} +}; + +class Launcher_cpp +{ + +public: + Launcher_cpp(); + ~Launcher_cpp(); + + long submitSalomeJob(const std::string fileToExecute , + const std::vector& filesToExport , + const std::vector& filesToImport , + const batchParams& batch_params, + const machineParams& params) throw(LauncherException); + + std::string querySalomeJob( const long jobId, const machineParams& params) 
throw(LauncherException); + void deleteSalomeJob( const long jobId, const machineParams& params) throw(LauncherException); + void getResultSalomeJob( const std::string directory, const long jobId, const machineParams& params ) throw(LauncherException); + + void SetResourcesManager( ResourcesManager_cpp* rm ) { _ResManager = rm; } + +protected: + + std::string buildSalomeCouplingScript(const string fileToExecute, const string dirForTmpFiles, const ParserResourcesType& params); + MpiImpl *FactoryMpiImpl(MpiImplType mpiImpl) throw(LauncherException); + Batch::BatchManager_eClient *FactoryBatchManager( const ParserResourcesType& params ) throw(LauncherException); + std::string getTmpDirForBatchFiles(); + std::string getRemoteFile( std::string remoteDir, std::string localFile ); + std::string getHomeDir(const ParserResourcesType& p, const std::string & tmpdir); + + std::map _batchmap; + std::map < std::pair , Batch::Job* > _jobmap; + ResourcesManager_cpp *_ResManager; + bool check(const batchParams& batch_params); + long getWallTime(std::string edt); + long getRamSize(std::string mem); +}; + +#endif diff --git a/src/Launcher/Makefile.am b/src/Launcher/Makefile.am index 604a5c5df..3ef64a802 100644 --- a/src/Launcher/Makefile.am +++ b/src/Launcher/Makefile.am @@ -36,13 +36,9 @@ include $(top_srcdir)/salome_adm/unix/make_common_starter.am # # header files salomeinclude_HEADERS = \ - BatchLight_BatchManager.hxx \ - BatchLight_BatchManager_PBS.hxx \ - BatchLight_BatchManager_SLURM.hxx \ - BatchLight_Job.hxx \ - MpiImpl.hxx \ BatchLight_BatchTest.hxx \ - SALOME_Launcher.hxx + SALOME_Launcher.hxx \ + Launcher.hxx # Scripts to be installed dist_salomescript_DATA = @@ -96,15 +92,10 @@ COMMON_LIBS =\ # Libraries targets # =============================================================== # -lib_LTLIBRARIES = libSalomeLauncher.la +lib_LTLIBRARIES = libLauncher.la libSalomeLauncher.la libSalomeLauncher_la_SOURCES=\ - SALOME_Launcher.cxx \ - BatchLight_BatchManager.cxx \ - 
BatchLight_BatchManager_SLURM.cxx \ - BatchLight_BatchManager_PBS.cxx \ - BatchLight_Job.cxx \ - MpiImpl.cxx \ - BatchLight_BatchTest.cxx + BatchLight_BatchTest.cxx \ + SALOME_Launcher.cxx libSalomeLauncher_la_CPPFLAGS =\ $(COMMON_CPPFLAGS) @@ -114,8 +105,24 @@ libSalomeLauncher_la_LDFLAGS =\ @LDEXPDYNFLAGS@ libSalomeLauncher_la_LIBADD =\ - $(COMMON_LIBS) + $(COMMON_LIBS) libLauncher.la +libLauncher_la_SOURCES=\ + Launcher.cxx + +libLauncher_la_CPPFLAGS =\ + -I$(srcdir)/../Batch \ + -I$(srcdir)/../ResourcesManager \ + @MPI_INCLUDES@ \ + @LIBXML_INCLUDES@ + +libLauncher_la_LDFLAGS =\ + -no-undefined -version-info=0:0:0 \ + @LDEXPDYNFLAGS@ + +libLauncher_la_LIBADD =\ + @MPI_LIBS@ \ + @LIBXML_LIBS@ # # =============================================================== diff --git a/src/Launcher/SALOME_Launcher.cxx b/src/Launcher/SALOME_Launcher.cxx index 8120d73f0..5c974c98b 100644 --- a/src/Launcher/SALOME_Launcher.cxx +++ b/src/Launcher/SALOME_Launcher.cxx @@ -17,9 +17,6 @@ // // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com // -#include "BatchLight_BatchManager_PBS.hxx" -#include "BatchLight_BatchManager_SLURM.hxx" -#include "BatchLight_Job.hxx" #include "SALOME_Launcher.hxx" #include "OpUtil.hxx" #include @@ -28,9 +25,6 @@ #endif #include #include "Utils_CorbaException.hxx" -#include "Batch_Date.hxx" - -#define TIME_OUT_TO_LAUNCH_CONT 21 using namespace std; @@ -45,11 +39,12 @@ const char *SALOME_Launcher::_LauncherNameInNS = "/SalomeLauncher"; */ //============================================================================= -SALOME_Launcher::SALOME_Launcher(CORBA::ORB_ptr orb, PortableServer::POA_var poa) +SALOME_Launcher::SALOME_Launcher(CORBA::ORB_ptr orb, PortableServer::POA_var poa) : _l() { - MESSAGE("constructor"); + MESSAGE("SALOME_Launcher constructor"); _NS = new SALOME_NamingService(orb); _ResManager = new SALOME_ResourcesManager(orb,poa,_NS); + _l.SetResourcesManager(_ResManager->GetImpl()); _ContManager = new 
SALOME_ContainerManager(orb,poa,_ResManager,_NS); _ResManager->_remove_ref(); _ContManager->_remove_ref(); @@ -61,7 +56,7 @@ SALOME_Launcher::SALOME_Launcher(CORBA::ORB_ptr orb, PortableServer::POA_var poa Engines::SalomeLauncher_var refContMan = Engines::SalomeLauncher::_narrow(obj); _NS->Register(refContMan,_LauncherNameInNS); - MESSAGE("constructor end"); + MESSAGE("SALOME_Launcher constructor end"); } //============================================================================= @@ -74,9 +69,6 @@ SALOME_Launcher::~SALOME_Launcher() { MESSAGE("destructor"); delete _NS; - std::map < string, BatchLight::BatchManager * >::const_iterator it; - for(it=_batchmap.begin();it!=_batchmap.end();it++) - delete it->second; } //============================================================================= @@ -125,34 +117,35 @@ CORBA::Long SALOME_Launcher::submitSalomeJob( const char * fileToExecute , { MESSAGE("BEGIN OF SALOME_Launcher::submitSalomeJob"); CORBA::Long jobId; - try{ - // find a cluster matching the structure params - Engines::CompoList aCompoList ; - Engines::MachineList *aMachineList = _ResManager->GetFittingResources(params, aCompoList); - if (aMachineList->length() == 0) - throw SALOME_Exception("No resources have been found with your parameters"); + + machineParams p; + p.hostname = params.hostname; + p.OS = params.OS; + p.nb_node = params.nb_node; + p.nb_proc_per_node = params.nb_proc_per_node; + p.cpu_clock = params.cpu_clock; + p.mem_mb = params.mem_mb; - const Engines::MachineParameters* p = _ResManager->GetMachineParameters((*aMachineList)[0]); - string clustername(p->alias); - INFOS("Choose cluster" << clustername); + batchParams bp; + bp.batch_directory = batch_params.batch_directory; + bp.expected_during_time = batch_params.expected_during_time; + bp.mem = batch_params.mem; + bp.nb_proc = batch_params.nb_proc; - // search batch manager for that cluster in map or instanciate one - std::map < string, BatchLight::BatchManager * >::const_iterator it = 
_batchmap.find(clustername); - if(it == _batchmap.end()) - _batchmap[clustername] = FactoryBatchManager(p); - - // create and submit job on cluster - BatchLight::Job* job = new BatchLight::Job(fileToExecute, filesToExport, filesToImport, batch_params); - bool res = job->check(); - if (!res) { - delete job; - throw SALOME_Exception("Job parameters are bad (see informations above)"); - } - jobId = _batchmap[clustername]->submitJob(job); + vector efl; + for(int i=0;i ifl; + for(int i=0;i::const_iterator it = _batchmap.find(clustername); - if(it == _batchmap.end()) - { - _batchmap[clustername] = FactoryBatchManager(p); - } - } - else - { - throw SALOME_Exception("Test of the batch machine failed - see messages in the SALOME_Launcher log"); } } catch(const SALOME_Exception &ex){ @@ -213,23 +196,20 @@ char* SALOME_Launcher::querySalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params) { string status; + machineParams p; + p.hostname = params.hostname; + p.OS = params.OS; + p.nb_node = params.nb_node; + p.nb_proc_per_node = params.nb_proc_per_node; + p.cpu_clock = params.cpu_clock; + p.mem_mb = params.mem_mb; + try{ - // find a cluster matching params structure - Engines::CompoList aCompoList ; - Engines::MachineList * aMachineList = _ResManager->GetFittingResources( params , aCompoList ) ; - const Engines::MachineParameters* p = _ResManager->GetMachineParameters((*aMachineList)[0]); - string clustername(p->alias); - - // search batch manager for that cluster in map - std::map < string, BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); - if(it == _batchmap.end()) - throw SALOME_Exception("no batchmanager for that cluster"); - - status = _batchmap[clustername]->queryJob(jobId); + status = _l.querySalomeJob(jobId,p); } - catch(const SALOME_Exception &ex){ + catch(const LauncherException &ex){ INFOS("Caught exception."); - THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::BAD_PARAM); + 
THROW_SALOME_CORBA_EXCEPTION(ex.msg.c_str(),SALOME::BAD_PARAM); } return CORBA::string_dup(status.c_str()); } @@ -244,23 +224,20 @@ char* SALOME_Launcher::querySalomeJob( const CORBA::Long jobId, void SALOME_Launcher::deleteSalomeJob( const CORBA::Long jobId, const Engines::MachineParameters& params) { + machineParams p; + p.hostname = params.hostname; + p.OS = params.OS; + p.nb_node = params.nb_node; + p.nb_proc_per_node = params.nb_proc_per_node; + p.cpu_clock = params.cpu_clock; + p.mem_mb = params.mem_mb; + try{ - // find a cluster matching params structure - Engines::CompoList aCompoList ; - Engines::MachineList *aMachineList = _ResManager->GetFittingResources( params , aCompoList ) ; - const Engines::MachineParameters* p = _ResManager->GetMachineParameters((*aMachineList)[0]); - string clustername(p->alias); - - // search batch manager for that cluster in map - std::map < string, BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); - if(it == _batchmap.end()) - throw SALOME_Exception("no batchmanager for that cluster"); - - _batchmap[clustername]->deleteJob(jobId); + _l.deleteSalomeJob(jobId,p); } - catch(const SALOME_Exception &ex){ + catch(const LauncherException &ex){ INFOS("Caught exception."); - THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::BAD_PARAM); + THROW_SALOME_CORBA_EXCEPTION(ex.msg.c_str(),SALOME::BAD_PARAM); } } @@ -275,54 +252,20 @@ void SALOME_Launcher::getResultSalomeJob( const char *directory, const CORBA::Long jobId, const Engines::MachineParameters& params) { + machineParams p; + p.hostname = params.hostname; + p.OS = params.OS; + p.nb_node = params.nb_node; + p.nb_proc_per_node = params.nb_proc_per_node; + p.cpu_clock = params.cpu_clock; + p.mem_mb = params.mem_mb; + try{ - // find a cluster matching params structure - Engines::CompoList aCompoList ; - Engines::MachineList *aMachineList = _ResManager->GetFittingResources( params , aCompoList ) ; - const Engines::MachineParameters* p = 
_ResManager->GetMachineParameters((*aMachineList)[0]); - string clustername(p->alias); - - // search batch manager for that cluster in map - std::map < string, BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); - if(it == _batchmap.end()) - throw SALOME_Exception("no batchmanager for that cluster"); - - _batchmap[clustername]->importOutputFiles( directory, jobId ); + _l.getResultSalomeJob( directory, jobId, p ); } - catch(const SALOME_Exception &ex){ + catch(const LauncherException &ex){ INFOS("Caught exception."); - THROW_SALOME_CORBA_EXCEPTION(ex.what(),SALOME::BAD_PARAM); - } -} - -//============================================================================= -/*! - * Factory to instanciate the good batch manager for choosen cluster. - */ -//============================================================================= - -BatchLight::BatchManager *SALOME_Launcher::FactoryBatchManager( const Engines::MachineParameters* params ) throw(SALOME_Exception) -{ - // Fill structure for batch manager - BatchLight::batchParams p; - p.hostname = params->alias; - p.protocol = params->protocol; - p.username = params->username; - p.applipath = params->applipath; - for(int i=0;imodList.length();i++) - p.modulesList.push_back((const char*)params->modList[i]); - p.nbnodes = params->nb_node; - p.nbprocpernode = params->nb_proc_per_node; - p.mpiImpl = params->mpiImpl; - - string sb = (const char*)params->batch; - if(sb == "pbs") - return new BatchLight::BatchManager_PBS(p); - else if(sb == "slurm") - return new BatchLight::BatchManager_SLURM(p); - else{ - MESSAGE("BATCH = " << params->batch); - throw SALOME_Exception("no batchmanager for that cluster"); + THROW_SALOME_CORBA_EXCEPTION(ex.msg.c_str(),SALOME::BAD_PARAM); } } diff --git a/src/Launcher/SALOME_Launcher.hxx b/src/Launcher/SALOME_Launcher.hxx index 281f6d8bf..c3ea5bbcf 100644 --- a/src/Launcher/SALOME_Launcher.hxx +++ b/src/Launcher/SALOME_Launcher.hxx @@ -23,8 +23,8 @@ #include #include 
CORBA_CLIENT_HEADER(SALOME_ContainerManager) #include "SALOME_ContainerManager.hxx" -#include "BatchLight_BatchManager.hxx" #include "BatchLight_BatchTest.hxx" +#include "Launcher.hxx" #include @@ -72,14 +72,13 @@ public: static const char *_LauncherNameInNS; protected: - BatchLight::BatchManager *FactoryBatchManager( const Engines::MachineParameters* params ) throw(SALOME_Exception); - - std::map _batchmap; CORBA::ORB_var _orb; PortableServer::POA_var _poa; SALOME_ContainerManager *_ContManager; SALOME_ResourcesManager *_ResManager; SALOME_NamingService *_NS; + + Launcher_cpp _l; }; #endif diff --git a/src/LifeCycleCORBA/TestContainerManager.cxx b/src/LifeCycleCORBA/TestContainerManager.cxx index b908e90f8..c53de4e84 100644 --- a/src/LifeCycleCORBA/TestContainerManager.cxx +++ b/src/LifeCycleCORBA/TestContainerManager.cxx @@ -149,7 +149,7 @@ int main (int argc, char * argv[]) } } string msg; - if( ((cmax-cmin) <= 2) && (fmax == 10/nbpmax) && !error ){ + if( ((cmax-cmin) <= 1) && (fmax == 10/nbpmax) && !error ){ if(bestImplemented) msg = "TEST OK"; else diff --git a/src/ResourcesManager/Makefile.am b/src/ResourcesManager/Makefile.am index dbb5bb380..b12707efa 100755 --- a/src/ResourcesManager/Makefile.am +++ b/src/ResourcesManager/Makefile.am @@ -38,7 +38,8 @@ salomeinclude_HEADERS = \ SALOME_ResourcesCatalog_Parser.hxx \ SALOME_ResourcesManager.hxx \ SALOME_ResourcesCatalog_Handler.hxx \ - SALOME_LoadRateManager.hxx + SALOME_LoadRateManager.hxx \ + ResourcesManager.hxx # # =============================================================== @@ -70,17 +71,28 @@ COMMON_LIBS =\ # Libraries targets # =============================================================== # -lib_LTLIBRARIES = libSalomeResourcesManager.la +lib_LTLIBRARIES = libResourcesManager.la libSalomeResourcesManager.la libSalomeResourcesManager_la_SOURCES =\ - SALOME_ResourcesCatalog_Parser.cxx \ - SALOME_ResourcesCatalog_Handler.cxx \ - SALOME_LoadRateManager.cxx \ SALOME_ResourcesManager.cxx 
libSalomeResourcesManager_la_CPPFLAGS =\ $(COMMON_CPPFLAGS) - libSalomeResourcesManager_la_LDFLAGS = -no-undefined -version-info=0:0:0 libSalomeResourcesManager_la_LIBADD =\ - $(COMMON_LIBS) + $(COMMON_LIBS) libResourcesManager.la + +libResourcesManager_la_SOURCES =\ + SALOME_ResourcesCatalog_Parser.cxx \ + SALOME_ResourcesCatalog_Handler.cxx \ + SALOME_LoadRateManager.cxx \ + ResourcesManager.cxx + +libResourcesManager_la_CPPFLAGS =\ + -I$(srcdir)/../Basics \ + -I$(srcdir)/../SALOMELocalTrace \ + @LIBXML_INCLUDES@ + +libResourcesManager_la_LDFLAGS = -no-undefined -version-info=0:0:0 +libResourcesManager_la_LIBADD =\ + @LIBXML_LIBS@ diff --git a/src/ResourcesManager/ResourcesManager.cxx b/src/ResourcesManager/ResourcesManager.cxx new file mode 100644 index 000000000..0d11897a7 --- /dev/null +++ b/src/ResourcesManager/ResourcesManager.cxx @@ -0,0 +1,486 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +#include "ResourcesManager.hxx" +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MAX_SIZE_FOR_HOSTNAME 256; + +using namespace std; + +//============================================================================= +/*! + * just for test + */ +//============================================================================= + +ResourcesManager_cpp:: +ResourcesManager_cpp(const char *xmlFilePath) : + _path_resources(xmlFilePath) +{ + cerr << "ResourcesManager_cpp constructor" << endl; +} + +//============================================================================= +/*! + * Standard constructor, parse resource file. + * - if ${APPLI} exists in environment, + * look for ${HOME}/${APPLI}/CatalogResources.xml + * - else look for default: + * ${KERNEL_ROOT_DIR}/share/salome/resources/kernel/CatalogResources.xml + * - parse XML resource file. + */ +//============================================================================= + +ResourcesManager_cpp::ResourcesManager_cpp() +{ + cerr << "ResourcesManager_cpp constructor" << endl; + _isAppliSalomeDefined = (getenv("APPLI") != 0); + + if (_isAppliSalomeDefined) + { + _path_resources = getenv("HOME"); + _path_resources += "/"; + _path_resources += getenv("APPLI"); + _path_resources += "/CatalogResources.xml"; + } + + else + { + _path_resources = getenv("KERNEL_ROOT_DIR"); + _path_resources += "/share/salome/resources/kernel/CatalogResources.xml"; + } + + ParseXmlFile(); + cerr << "ResourcesManager_cpp constructor end"; +} + +//============================================================================= +/*! 
+ * Standard Destructor + */ +//============================================================================= + +ResourcesManager_cpp::~ResourcesManager_cpp() +{ + cerr << "ResourcesManager_cpp destructor" << endl; +} + +//============================================================================= +/*! + * get the list of name of ressources fitting for the specified module. + * If hostname specified, check it is local or known in resources catalog. + * + * Else + * - select first machines with corresponding OS (all machines if + * parameter OS empty), + * - then select the sublist of machines on witch the module is known + * (if the result is empty, that probably means that the inventory of + * modules is probably not done, so give complete list from previous step) + */ +//============================================================================= + +std::vector +ResourcesManager_cpp::GetFittingResources(const machineParams& params, + const std::vector& componentList) throw(ResourcesException) +{ +// cerr << "ResourcesManager_cpp::GetFittingResources" << endl; + vector vec; + + ParseXmlFile(); + + const char *hostname = params.hostname.c_str(); + cerr << "GetFittingResources " << hostname << " " << GetHostname().c_str() << endl; + + if (hostname[0] != '\0'){ + // cerr << "ResourcesManager_cpp::GetFittingResources : hostname specified" << endl; + + if ( strcmp(hostname, "localhost") == 0 || + strcmp(hostname, GetHostname().c_str()) == 0 ) + { + // cerr << "ResourcesManager_cpp::GetFittingResources : localhost" << endl; + vec.push_back(GetHostname().c_str()); + // cerr << "ResourcesManager_cpp::GetFittingResources : " << vec.size() << endl; + } + + else if (_resourcesList.find(hostname) != _resourcesList.end()) + { + // --- params.hostname is in the list of resources so return it. + vec.push_back(hostname); + } + else if (_resourcesBatchList.find(hostname) != _resourcesBatchList.end()) + { + // --- params.hostname is in the list of resources so return it. 
+ vec.push_back(hostname); + } + else + { + // Cas d'un cluster: nombre de noeuds > 1 + int cpt=0; + for (map::const_iterator iter = _resourcesList.begin(); iter != _resourcesList.end(); iter++){ + if( (*iter).second.DataForSort._nbOfNodes > 1 ){ + if( strncmp(hostname,(*iter).first.c_str(),strlen(hostname)) == 0 ){ + vec.push_back((*iter).first.c_str()); + //cerr << "SALOME_ResourcesManager_cpp::GetFittingResources vector[" + // << cpt << "] = " << (*iter).first.c_str() << endl ; + cpt++; + } + } + } + if(cpt==0){ + // --- user specified an unknown hostame so notify him. + cerr << "ResourcesManager_cpp::GetFittingResources : SALOME_Exception" << endl; + throw ResourcesException("unknown host"); + } + } + } + + else{ + // --- Search for available resources sorted by priority + SelectOnlyResourcesWithOS(vec, params.OS.c_str()); + + KeepOnlyResourcesWithModule(vec, componentList); + + if (vec.size() == 0) + SelectOnlyResourcesWithOS(vec, params.OS.c_str()); + + // --- set wanted parameters + ResourceDataToSort::_nbOfNodesWanted = params.nb_node; + + ResourceDataToSort::_nbOfProcPerNodeWanted = params.nb_proc_per_node; + + ResourceDataToSort::_CPUFreqMHzWanted = params.cpu_clock; + + ResourceDataToSort::_memInMBWanted = params.mem_mb; + + // --- end of set + + list li; + + for (vector::iterator iter = vec.begin(); + iter != vec.end(); + iter++) + li.push_back(_resourcesList[(*iter)].DataForSort); + + li.sort(); + + unsigned int i = 0; + + for (list::iterator iter2 = li.begin(); + iter2 != li.end(); + iter2++) + vec[i++] = (*iter2)._hostName; + } + + return vec; + +} + +//============================================================================= +/*! + * add an entry in the ressources catalog xml file. 
+ * Return 0 if OK (KERNEL found in new resources modules) else throw exception + */ +//============================================================================= + +int +ResourcesManager_cpp:: +AddResourceInCatalog(const machineParams& paramsOfNewResources, + const vector& modulesOnNewResources, + const char *alias, + const char *userName, + AccessModeType mode, + AccessProtocolType prot) +throw(ResourcesException) +{ + vector::const_iterator iter = find(modulesOnNewResources.begin(), + modulesOnNewResources.end(), + "KERNEL"); + + if (iter != modulesOnNewResources.end()) + { + ParserResourcesType newElt; + newElt.DataForSort._hostName = paramsOfNewResources.hostname; + newElt.Alias = alias; + newElt.Protocol = prot; + newElt.Mode = mode; + newElt.UserName = userName; + newElt.ModulesList = modulesOnNewResources; + newElt.OS = paramsOfNewResources.OS; + newElt.DataForSort._memInMB = paramsOfNewResources.mem_mb; + newElt.DataForSort._CPUFreqMHz = paramsOfNewResources.cpu_clock; + newElt.DataForSort._nbOfNodes = paramsOfNewResources.nb_node; + newElt.DataForSort._nbOfProcPerNode = + paramsOfNewResources.nb_proc_per_node; + _resourcesList[newElt.DataForSort._hostName] = newElt; + return 0; + } + + else + throw ResourcesException("KERNEL is not present in this resource"); +} + +//============================================================================= +/*! + * Deletes a resource from the catalog + */ +//============================================================================= + +void ResourcesManager_cpp::DeleteResourceInCatalog(const char *hostname) +{ + _resourcesList.erase(hostname); +} + +//============================================================================= +/*! + * write the current data in memory in file. 
+ */ +//============================================================================= + +void ResourcesManager_cpp::WriteInXmlFile() +{ + const char* aFilePath = _path_resources.c_str(); + + FILE* aFile = fopen(aFilePath, "w"); + + if (aFile == NULL) + { + cerr << "Error opening file !" << endl; + return; + } + + xmlDocPtr aDoc = xmlNewDoc(BAD_CAST "1.0"); + xmlNewDocComment(aDoc, BAD_CAST "ResourcesCatalog"); + + SALOME_ResourcesCatalog_Handler* handler = + new SALOME_ResourcesCatalog_Handler(_resourcesList, _resourcesBatchList); + handler->PrepareDocToXmlFile(aDoc); + delete handler; + + int isOk = xmlSaveFile(aFilePath, aDoc); + + if (!isOk) + cerr << "Error while XML file saving." << endl; + + // Free the document + xmlFreeDoc(aDoc); + + fclose(aFile); + + cerr << "WRITING DONE!" << endl; +} + +//============================================================================= +/*! + * parse the data type catalog + */ +//============================================================================= + +const MapOfParserResourcesType& ResourcesManager_cpp::ParseXmlFile() +{ + SALOME_ResourcesCatalog_Handler* handler = + new SALOME_ResourcesCatalog_Handler(_resourcesList, _resourcesBatchList); + + const char* aFilePath = _path_resources.c_str(); + FILE* aFile = fopen(aFilePath, "r"); + + if (aFile != NULL) + { + xmlDocPtr aDoc = xmlReadFile(aFilePath, NULL, 0); + + if (aDoc != NULL) + handler->ProcessXmlDocument(aDoc); + else + cerr << "ResourcesManager_cpp: could not parse file "<< aFilePath << endl; + + // Free the document + xmlFreeDoc(aDoc); + + fclose(aFile); + } + else + cerr << "ResourcesManager_cpp: file "<& listOfMachines) +{ + return _dynamicResourcesSelecter.FindFirst(listOfMachines); +} + +//============================================================================= +/*! 
+ * dynamically obtains the best machines + */ +//============================================================================= + +string ResourcesManager_cpp::FindNext(const std::vector& listOfMachines) +{ + return _dynamicResourcesSelecter.FindNext(listOfMachines,_resourcesList); +} +//============================================================================= +/*! + * dynamically obtains the best machines + */ +//============================================================================= + +string ResourcesManager_cpp::FindBest(const std::vector& listOfMachines) +{ + return _dynamicResourcesSelecter.FindBest(listOfMachines); +} + +//============================================================================= +/*! + * Gives a sublist of machines with matching OS. + * If parameter OS is empty, gives the complete list of machines + */ +//============================================================================= + +// Warning need an updated parsed list : _resourcesList +void ResourcesManager_cpp::SelectOnlyResourcesWithOS( vector& hosts, const char *OS) const +throw(ResourcesException) +{ + string base(OS); + + for (map::const_iterator iter = + _resourcesList.begin(); + iter != _resourcesList.end(); + iter++) + { + if ( (*iter).second.OS == base || base.size() == 0) + hosts.push_back((*iter).first); + } +} + + +//============================================================================= +/*! + * Gives a sublist of machines on which the module is known. 
+ */ +//============================================================================= + +//Warning need an updated parsed list : _resourcesList +void ResourcesManager_cpp::KeepOnlyResourcesWithModule( vector& hosts, const vector& componentList) const +throw(ResourcesException) +{ + for (vector::iterator iter = hosts.begin(); iter != hosts.end();) + { + MapOfParserResourcesType::const_iterator it = _resourcesList.find(*iter); + const vector& mapOfModulesOfCurrentHost = (((*it).second).ModulesList); + + bool erasedHost = false; + if( mapOfModulesOfCurrentHost.size() > 0 ){ + for(int i=0;i::const_iterator itt = find(mapOfModulesOfCurrentHost.begin(), + mapOfModulesOfCurrentHost.end(), + compoi); +// componentList[i]); + if (itt == mapOfModulesOfCurrentHost.end()){ + erasedHost = true; + break; + } + } + } + if(erasedHost) + hosts.erase(iter); + else + iter++; + } +} + + +ParserResourcesType ResourcesManager_cpp::GetResourcesList(const std::string& machine) +{ + if (_resourcesList.find(machine) != _resourcesList.end()) + return _resourcesList[machine]; + else + return _resourcesBatchList[machine]; +} + +std::string ResourcesManager_cpp::GetHostname() +{ + int ls = 100, r = 1; + char *s; + + while (ls < 10000 && r) { + ls *= 2; + s = new char[ls]; + r = gethostname(s, ls-1); + switch (r) + { + case 0: + break; + default: +#ifdef EINVAL + case EINVAL: +#endif +#ifdef ENAMETOOLONG + case ENAMETOOLONG: +#endif + delete [] s; + continue; + } + } + + if (r != 0) { + s = new char[50]; + strcpy(s, "localhost"); + } + + // remove all after '.' 
+ char *aDot = (strchr(s,'.')); + if (aDot) aDot[0] = '\0'; + + string p = s; + delete [] s; + return p; +} + diff --git a/src/ResourcesManager/ResourcesManager.hxx b/src/ResourcesManager/ResourcesManager.hxx new file mode 100644 index 000000000..951aba5da --- /dev/null +++ b/src/ResourcesManager/ResourcesManager.hxx @@ -0,0 +1,116 @@ +// Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +#ifndef __RESOURCESMANAGER_HXX__ +#define __RESOURCESMANAGER_HXX__ + +#include +#include +#include +#include "SALOME_ResourcesCatalog_Parser.hxx" +#include "SALOME_ResourcesCatalog_Handler.hxx" +#include "SALOME_LoadRateManager.hxx" + +// --- WARNING --- +// The call of BuildTempFileToLaunchRemoteContainer and RmTmpFile must be done +// in a critical section to be sure to be clean. +// Only one thread should use the SALOME_ResourcesManager class in a SALOME +// session. 
+ +struct machineParams{ + std::string hostname; + std::string OS; + unsigned int nb_node; + unsigned int nb_proc_per_node; + unsigned int cpu_clock; + unsigned int mem_mb; +}; + +class ResourcesException +{ +public: + const std::string msg; + + ResourcesException(const std::string m) : msg(m) {} +}; + +class ResourcesManager_cpp + { + + public: + + ResourcesManager_cpp(const char *xmlFilePath); + ResourcesManager_cpp(); + + ~ResourcesManager_cpp(); + + std::vector + GetFittingResources(const machineParams& params, + const std::vector& componentList) throw(ResourcesException); + + std::string FindFirst(const std::vector& listOfMachines); + std::string FindNext(const std::vector& listOfMachines); + std::string FindBest(const std::vector& listOfMachines); + + int AddResourceInCatalog + (const machineParams& paramsOfNewResources, + const std::vector& modulesOnNewResources, + const char *alias, + const char *userName, + AccessModeType mode, + AccessProtocolType prot) throw(ResourcesException); + + void DeleteResourceInCatalog(const char *hostname); + + void WriteInXmlFile(); + + const MapOfParserResourcesType& ParseXmlFile(); + + const MapOfParserResourcesType& GetList() const; + + ParserResourcesType GetResourcesList(const std::string& machine); + + protected: + + void SelectOnlyResourcesWithOS(std::vector& hosts, + const char *OS) const + throw(ResourcesException); + + void KeepOnlyResourcesWithModule(std::vector& hosts, + const std::vector& componentList) const + throw(ResourcesException); + + //! will contain the path to the ressources catalog + std::string _path_resources; + + //! will contain the informations on the data type catalog(after parsing) + MapOfParserResourcesType _resourcesList; + + //! will contain the informations on the data type catalog(after parsing) + MapOfParserResourcesType _resourcesBatchList; + + SALOME_LoadRateManager _dynamicResourcesSelecter; + + //! 
different behaviour if $APPLI exists (SALOME Application) + bool _isAppliSalomeDefined; + + std::string GetHostname(); + }; + +#endif // __RESOURCESMANAGER_HXX__ diff --git a/src/ResourcesManager/SALOME_LoadRateManager.cxx b/src/ResourcesManager/SALOME_LoadRateManager.cxx index f0df795cb..85b90e8dc 100644 --- a/src/ResourcesManager/SALOME_LoadRateManager.cxx +++ b/src/ResourcesManager/SALOME_LoadRateManager.cxx @@ -18,82 +18,47 @@ // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com // #include "SALOME_LoadRateManager.hxx" -#include "utilities.h" #include #include using namespace std; -string SALOME_LoadRateManager::FindFirst(const Engines::MachineList& hosts) +string SALOME_LoadRateManager::FindFirst(const vector& hosts) { - MESSAGE("SALOME_LoadRateManager::FindFirst " << hosts.length()); - - if (hosts.length() == 0) + if (hosts.size() == 0) return string(""); return string(hosts[0]); } -string SALOME_LoadRateManager::FindNext(const Engines::MachineList& hosts,MapOfParserResourcesType& resList,SALOME_NamingService *ns) +string SALOME_LoadRateManager::FindNext(const vector& hosts,MapOfParserResourcesType& resList) { - MESSAGE("SALOME_LoadRateManager::FindNext " << hosts.length()); - map machines; + static int imachine = 0; + static int iproc = 0; - if (hosts.length() == 0) + // if empty list return empty string + if (hosts.size() == 0) return string(""); - - for(int i=0;iChange_Directory("/Containers"); - vector vec = ns->list_directory_recurs(); - Engines::Container_var cont; - for(vector::iterator iter = vec.begin();iter!=vec.end();iter++){ - try - { - CORBA::Object_var obj=ns->Resolve((*iter).c_str()); - cont=Engines::Container::_narrow(obj); - } - catch(CORBA::SystemException& ex) - { - MESSAGE("SALOME_LoadRateManager::FindNext CORBA::SystemException ignore it"); - continue; - } - if(!CORBA::is_nil(cont)){ - try - { - CORBA::String_var hostname = cont->getHostName(); - std::string mach=(const char*)hostname; - machines[mach]++; - } 
- catch(CORBA::SystemException& ex) - { - MESSAGE("SALOME_LoadRateManager::FindNext CORBA::SystemException ignore it"); - continue; - } + else{ + ParserResourcesType resource = resList[string(hosts[imachine])]; + int nbproc = resource.DataForSort._nbOfProcPerNode * resource.DataForSort._nbOfNodes; + if( nbproc <= 0) nbproc = 1; + if( iproc < nbproc ){ + iproc++; + return string(hosts[imachine]); } - } - - int imin = 0; - ParserResourcesType resource = resList[string(hosts[0])]; - int nbproc = resource.DataForSort._nbOfProcPerNode * resource.DataForSort._nbOfNodes; - int min = machines[string(hosts[0])]/nbproc; - for(int i=1;i& hosts) { // for the moment then "maui" will be used for dynamic selection ... - MESSAGE("SALOME_LoadRateManager::FindBest " << hosts.length()); - throw(SALOME_Exception(LOCALIZED("not yet implemented"))); - return string(""); + return FindFirst(hosts); } diff --git a/src/ResourcesManager/SALOME_LoadRateManager.hxx b/src/ResourcesManager/SALOME_LoadRateManager.hxx index 061b925ae..7c4c3f03d 100644 --- a/src/ResourcesManager/SALOME_LoadRateManager.hxx +++ b/src/ResourcesManager/SALOME_LoadRateManager.hxx @@ -20,11 +20,8 @@ #ifndef __SALOME_LOADRATEMANAGER_HXX__ #define __SALOME_LOADRATEMANAGER_HXX__ -#include -#include CORBA_CLIENT_HEADER(SALOME_ContainerManager) #include #include "SALOME_ResourcesCatalog_Parser.hxx" -#include "SALOME_NamingService.hxx" #if defined RESOURCESMANAGER_EXPORTS #if defined WIN32 @@ -44,9 +41,9 @@ class RESOURCESMANAGER_EXPORT SALOME_LoadRateManager { public: - std::string FindFirst(const Engines::MachineList& hosts); - std::string FindNext(const Engines::MachineList& hosts,MapOfParserResourcesType& resList,SALOME_NamingService *ns); - std::string FindBest(const Engines::MachineList& hosts) throw (SALOME_Exception); + std::string FindFirst(const std::vector& hosts); + std::string FindNext(const std::vector& hosts,MapOfParserResourcesType& resList); + std::string FindBest(const std::vector& hosts); }; #endif diff 
--git a/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx b/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx index 181e40374..7d1c53e2a 100755 --- a/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx +++ b/src/ResourcesManager/SALOME_ResourcesCatalog_Handler.cxx @@ -52,7 +52,6 @@ SALOME_ResourcesCatalog_Handler(MapOfParserResourcesType& resources_list, _resources_list(resources_list), _resources_batch_list(resources_batch_list) { - MESSAGE("SALOME_ResourcesCatalog_Handler creation"); //XML tags initialisation test_machine = "machine"; test_resources = "resources"; @@ -82,7 +81,7 @@ SALOME_ResourcesCatalog_Handler(MapOfParserResourcesType& resources_list, SALOME_ResourcesCatalog_Handler::~SALOME_ResourcesCatalog_Handler() { - // MESSAGE("SALOME_ResourcesCatalog_Handler destruction"); + // cout << "SALOME_ResourcesCatalog_Handler destruction") << endl; } //============================================================================= @@ -105,7 +104,7 @@ SALOME_ResourcesCatalog_Handler::GetResourcesAfterParsing() const void SALOME_ResourcesCatalog_Handler::ProcessXmlDocument(xmlDocPtr theDoc) { - if (MYDEBUG) MESSAGE("Begin parse document"); +// if (MYDEBUG) cout << "Begin parse document" << endl; // Empty private elements _resources_list.clear(); @@ -191,8 +190,6 @@ void SALOME_ResourcesCatalog_Handler::ProcessXmlDocument(xmlDocPtr theDoc) _resource.Batch = pbs; else if (aBatch == "lsf") _resource.Batch = lsf; - else if (aBatch == "slurm") - _resource.Batch = slurm; else _resource.Batch = none; } @@ -210,6 +207,8 @@ void SALOME_ResourcesCatalog_Handler::ProcessXmlDocument(xmlDocPtr theDoc) _resource.mpi = mpich2; else if (anMpi == "openmpi") _resource.mpi = openmpi; + else if (anMpi == "slurm") + _resource.mpi = slurm; else _resource.mpi = indif; } @@ -325,7 +324,8 @@ void SALOME_ResourcesCatalog_Handler::ProcessXmlDocument(xmlDocPtr theDoc) SCRUTE((*iter).second.Mode); } - MESSAGE("This is the end of document"); +// cout << "This is the end of 
document" << endl; +// } } } diff --git a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx index d0a75adb8..1ebc6cb41 100644 --- a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx +++ b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx @@ -18,8 +18,8 @@ // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com // #include "SALOME_ResourcesCatalog_Parser.hxx" -#include "utilities.h" #include +#include #define NULL_VALUE 0 @@ -106,10 +106,10 @@ unsigned int ResourceDataToSort::GetNumberOfPoints() const //! Method used for debug void ResourceDataToSort::Print() const { - SCRUTE(_nbOfNodes); - SCRUTE(_nbOfProcPerNode); - SCRUTE(_CPUFreqMHz); - SCRUTE(_memInMB); + cout << _nbOfNodes << endl; + cout << _nbOfProcPerNode << endl; + cout << _CPUFreqMHz << endl; + cout << _memInMB << endl; } void ParserResourcesType::Print() const @@ -134,7 +134,7 @@ void ParserResourcesType::Print() const for(int i=0;iRegister(refContMan,_ResourcesManagerNameInNS); - _MpiStarted = false; - MESSAGE("constructor end"); + MESSAGE("SALOME_ResourcesManager constructor end"); } //============================================================================= @@ -86,9 +84,9 @@ SALOME_ResourcesManager(CORBA::ORB_ptr orb, SALOME_ResourcesManager::SALOME_ResourcesManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, - SALOME_NamingService *ns) + SALOME_NamingService *ns) : _rm() { - MESSAGE("constructor"); + MESSAGE("SALOME_ResourcesManager constructor"); _NS = ns; _orb = CORBA::ORB::_duplicate(orb) ; _poa = PortableServer::POA::_duplicate(poa) ; @@ -97,25 +95,7 @@ SALOME_ResourcesManager::SALOME_ResourcesManager(CORBA::ORB_ptr orb, Engines::ResourcesManager_var refContMan = Engines::ResourcesManager::_narrow(obj); _NS->Register(refContMan,_ResourcesManagerNameInNS); - _isAppliSalomeDefined = (getenv("APPLI") != 0); - _MpiStarted = false; - - if (_isAppliSalomeDefined) - { - _path_resources = 
getenv("HOME"); - _path_resources += "/"; - _path_resources += getenv("APPLI"); - _path_resources += "/CatalogResources.xml"; - } - - else - { - _path_resources = getenv("KERNEL_ROOT_DIR"); - _path_resources += "/share/salome/resources/kernel/CatalogResources.xml"; - } - - ParseXmlFile(); - MESSAGE("constructor end"); + MESSAGE("SALOME_ResourcesManager constructor end"); } //============================================================================= @@ -126,7 +106,7 @@ SALOME_ResourcesManager::SALOME_ResourcesManager(CORBA::ORB_ptr orb, SALOME_ResourcesManager::~SALOME_ResourcesManager() { - MESSAGE("destructor"); + MESSAGE("SALOME_ResourcesManager destructor"); } @@ -163,256 +143,34 @@ Engines::MachineList * SALOME_ResourcesManager::GetFittingResources(const Engines::MachineParameters& params, const Engines::CompoList& componentList) { - vector vec; +// MESSAGE("ResourcesManager::GetFittingResources"); + machineParams p; + p.hostname = params.hostname; + p.OS = params.OS; + p.nb_node = params.nb_node; + p.nb_proc_per_node = params.nb_proc_per_node; + p.cpu_clock = params.cpu_clock; + p.mem_mb = params.mem_mb; + + vector cl; + for(int i=0;i 1 - int cpt=0; - for (map::const_iterator iter = _resourcesList.begin(); iter != _resourcesList.end(); iter++){ - if( (*iter).second.DataForSort._nbOfNodes > 1 ){ - if( strncmp(hostname,(*iter).first.c_str(),strlen(hostname)) == 0 ){ - vec.push_back((*iter).first.c_str()); - //cout << "SALOME_ResourcesManager::GetFittingResources vector[" - // << cpt << "] = " << (*iter).first.c_str() << endl ; - cpt++; - } - } - } - if(cpt==0){ - // --- user specified an unknown hostame so notify him. 
- MESSAGE("ResourcesManager::GetFittingResources : SALOME_Exception"); - throw SALOME_Exception("unknown host"); - } - } - } - - else - // --- Search for available resources sorted by priority - { - SelectOnlyResourcesWithOS(vec, params.OS); - - KeepOnlyResourcesWithModule(vec, componentList); - - if (vec.size() == 0) - SelectOnlyResourcesWithOS(vec, params.OS); - - // --- set wanted parameters - ResourceDataToSort::_nbOfNodesWanted = params.nb_node; - - ResourceDataToSort::_nbOfProcPerNodeWanted = params.nb_proc_per_node; - - ResourceDataToSort::_CPUFreqMHzWanted = params.cpu_clock; - - ResourceDataToSort::_memInMBWanted = params.mem_mb; - - // --- end of set - - list li; - - for (vector::iterator iter = vec.begin(); - iter != vec.end(); - iter++) - li.push_back(_resourcesList[(*iter)].DataForSort); - - li.sort(); - - unsigned int i = 0; - - for (list::iterator iter2 = li.begin(); - iter2 != li.end(); - iter2++) - vec[i++] = (*iter2)._hostName; - } - - // MESSAGE("ResourcesManager::GetFittingResources : return" << ret.size()); - ret->length(vec.size()); - for(unsigned int i=0;i vec = _rm.GetFittingResources(p,cl); + ret->length(vec.size()); + for(int i=0;i& modulesOnNewResources, - const char *alias, - const char *userName, - AccessModeType mode, - AccessProtocolType prot) -throw(SALOME_Exception) -{ - vector::const_iterator iter = find(modulesOnNewResources.begin(), - modulesOnNewResources.end(), - "KERNEL"); - - if (iter != modulesOnNewResources.end()) - { - ParserResourcesType newElt; - newElt.DataForSort._hostName = paramsOfNewResources.hostname; - newElt.HostName = paramsOfNewResources.hostname; - newElt.Alias = alias; - newElt.Protocol = prot; - newElt.Mode = mode; - newElt.UserName = userName; - newElt.ModulesList = modulesOnNewResources; - newElt.OS = paramsOfNewResources.OS; - newElt.DataForSort._memInMB = paramsOfNewResources.mem_mb; - newElt.DataForSort._CPUFreqMHz = paramsOfNewResources.cpu_clock; - newElt.DataForSort._nbOfNodes = 
paramsOfNewResources.nb_node; - newElt.DataForSort._nbOfProcPerNode = - paramsOfNewResources.nb_proc_per_node; - _resourcesList[newElt.DataForSort._hostName] = newElt; - return 0; - } - - else - throw SALOME_Exception("KERNEL is not present in this resource"); -} - -//============================================================================= -/*! - * Deletes a resource from the catalog - */ -//============================================================================= - -void SALOME_ResourcesManager::DeleteResourceInCatalog(const char *hostname) -{ - _resourcesList.erase(hostname); -} - -//============================================================================= -/*! - * write the current data in memory in file. - */ -//============================================================================= - -void SALOME_ResourcesManager::WriteInXmlFile() -{ - const char* aFilePath = _path_resources.c_str(); - - FILE* aFile = fopen(aFilePath, "w"); - - if (aFile == NULL) - { - INFOS("Error opening file !"); - return; - } - - xmlDocPtr aDoc = xmlNewDoc(BAD_CAST "1.0"); - xmlNewDocComment(aDoc, BAD_CAST "ResourcesCatalog"); - - SALOME_ResourcesCatalog_Handler* handler = - new SALOME_ResourcesCatalog_Handler(_resourcesList, _resourcesBatchList); - handler->PrepareDocToXmlFile(aDoc); - delete handler; - - int isOk = xmlSaveFile(aFilePath, aDoc); - - if (!isOk) - INFOS("Error while XML file saving."); - - // Free the document - xmlFreeDoc(aDoc); - - fclose(aFile); - - MESSAGE("WRITING DONE!"); -} - -//============================================================================= -/*! 
- * parse the data type catalog - */ -//============================================================================= - -const MapOfParserResourcesType& SALOME_ResourcesManager::ParseXmlFile() -{ - SALOME_ResourcesCatalog_Handler* handler = - new SALOME_ResourcesCatalog_Handler(_resourcesList, _resourcesBatchList); - - const char* aFilePath = _path_resources.c_str(); - FILE* aFile = fopen(aFilePath, "r"); - - if (aFile != NULL) - { - xmlDocPtr aDoc = xmlReadFile(aFilePath, NULL, 0); - - if (aDoc != NULL) - handler->ProcessXmlDocument(aDoc); - else - INFOS("ResourcesManager: could not parse file "< ml; + for(int i=0;i= 2) - if (strcmp(ContainerName + len - 2, "Py") == 0) - ret = true; - - return ret; -} - - -//============================================================================= -/*! - * Builds the script to be launched - * - * If SALOME Application not defined ($APPLI), - * see BuildTempFileToLaunchRemoteContainer() - * - * Else rely on distant configuration. Command is under the form (example): - * ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \ - * SALOME_Container containerName &" - - * - where user is ommited if not specified in CatalogResources, - * - where distant path is always relative to user@machine $HOME, and - * equal to $APPLI if not specified in CatalogResources, - * - where hostNS is the hostname of CORBA naming server (set by scripts to - * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh) - * - where portNS is the port used by CORBA naming server (set by scripts to - * use to launch SALOME and servers in $APPLI: runAppli.sh, runRemote.sh) - * - where workingdir is the requested working directory for the container. 
- * If WORKINGDIR (and workingdir) is not present the working dir will be $HOME - */ -//============================================================================= - -string -SALOME_ResourcesManager::BuildCommandToLaunchRemoteContainer -(const string& machine, - const Engines::MachineParameters& params, const long id) -{ - string command; - int nbproc; - char idc[3*sizeof(long)]; - - if ( ! _isAppliSalomeDefined ) - command = BuildTempFileToLaunchRemoteContainer(machine, params); - - else - { - const ParserResourcesType& resInfo = _resourcesList[machine]; - - if (params.isMPI) - { - if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) - nbproc = 1; - else if ( params.nb_node == 0 ) - nbproc = params.nb_proc_per_node; - else if ( params.nb_proc_per_node == 0 ) - nbproc = params.nb_node; - else - nbproc = params.nb_node * params.nb_proc_per_node; - } - - // "ssh user@machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \ - // SALOME_Container containerName &" - - if (resInfo.Protocol == rsh) - command = "rsh "; - else if (resInfo.Protocol == ssh) - command = "ssh "; - else - throw SALOME_Exception("Unknown protocol"); - - if (resInfo.UserName != "") - { - command += resInfo.UserName; - command += "@"; - } - - command += machine; - command += " "; - - if (resInfo.AppliPath != "") - command += resInfo.AppliPath; // path relative to user@machine $HOME - else - { - ASSERT(getenv("APPLI")); - command += getenv("APPLI"); // path relative to user@machine $HOME - } - - command += "/runRemote.sh "; - - ASSERT(getenv("NSHOST")); - command += getenv("NSHOST"); // hostname of CORBA name server - - command += " "; - ASSERT(getenv("NSPORT")); - command += getenv("NSPORT"); // port of CORBA name server - - std::string wdir=params.workingdir.in(); - if(wdir != "") - { - command += " WORKINGDIR "; - command += " '"; - if(wdir == "$TEMPDIR") - wdir="\\$TEMPDIR"; - command += wdir; // requested working directory - command += "'"; - } - - if(params.isMPI) - { 
- command += " mpirun -np "; - std::ostringstream o; - o << nbproc << " "; - command += o.str(); -#ifdef WITHLAM - command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; -#endif - command += " SALOME_MPIContainer "; - } - else - command += " SALOME_Container "; - - command += _NS->ContainerName(params); - command += " -id "; - sprintf(idc,"%ld",id); - command += idc; - command += " -"; - AddOmninamesParams(command); - - MESSAGE("command =" << command); - } - - return command; -} - -//============================================================================= -/*! - * builds the command to be launched. - */ -//============================================================================= - -string -SALOME_ResourcesManager::BuildCommandToLaunchLocalContainer -(const Engines::MachineParameters& params, const long id) -{ - _TmpFileName = ""; - string command; - int nbproc = 0; - char idc[3*sizeof(long)]; - - if (params.isMPI) - { - command = "mpirun -np "; - - if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) - nbproc = 1; - else if ( params.nb_node == 0 ) - nbproc = params.nb_proc_per_node; - else if ( params.nb_proc_per_node == 0 ) - nbproc = params.nb_node; - else - nbproc = params.nb_node * params.nb_proc_per_node; - - std::ostringstream o; - - o << nbproc << " "; - - command += o.str(); -#ifdef WITHLAM - command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; -#endif - - if (isPythonContainer(params.container_name)) - command += "pyMPI SALOME_ContainerPy.py "; - else - command += "SALOME_MPIContainer "; - } - - else - { - command=""; - std::string wdir=params.workingdir.in(); - if(wdir != "") - { - // a working directory is requested - if(wdir == "$TEMPDIR") - { - // a new temporary directory is requested - char dir[]="/tmp/salomeXXXXXX"; - char* mdir=mkdtemp(dir); - if(mdir==NULL) - std::cerr << "Problem in mkdtemp " << dir << " " << mdir << std::endl; - else - command="cd "+std::string(dir)+";"; - } - else - { - // a 
permanent directory is requested use it or create it - command="mkdir -p " + wdir + " && cd " + wdir + ";"; - } - } - if (isPythonContainer(params.container_name)) - command += "SALOME_ContainerPy.py "; - else - command += "SALOME_Container "; - } - - command += _NS->ContainerName(params); - command += " -id "; - sprintf(idc,"%ld",id); - command += idc; - command += " -"; - AddOmninamesParams(command); - - MESSAGE("Command is ... " << command); - return command; -} - - -//============================================================================= -/*! - * removes the generated temporary file in case of a remote launch. - */ -//============================================================================= - -void SALOME_ResourcesManager::RmTmpFile() -{ - if (_TmpFileName != "") - { -#ifndef WNT - string command = "rm "; -#else - string command = "del /F "; -#endif - command += _TmpFileName; - char *temp = strdup(command.c_str()); - int lgthTemp = strlen(temp); - temp[lgthTemp - 3] = '*'; - temp[lgthTemp - 2] = '\0'; - system(temp); - free(temp); - } -} - - -//============================================================================= -/*! 
- * builds the script to be launched - */ -//============================================================================= - -string -SALOME_ResourcesManager::BuildCommand -(const string& machine, - const char *containerName) -{ - // rsh -n ikkyo /export/home/rahuel/SALOME_ROOT/bin/runSession SALOME_Container -ORBInitRef NameService=corbaname::dm2s0017:1515 & - const ParserResourcesType& resInfo = _resourcesList[machine]; - bool pyCont = isPythonContainer(containerName); - - string command; - - if (resInfo.Protocol == rsh) - command = "rsh -n " ; - else if (resInfo.Protocol == ssh) - command = "ssh -f -n "; - else - throw SALOME_Exception("Not implemented yet..."); - - command += machine; - command += " "; - string path = getenv("KERNEL_ROOT_DIR"); - command += path; - command += "/bin/salome/"; - - if ( pyCont ) - command += "SALOME_ContainerPy.py "; - else - command += "SALOME_Container "; - - command += containerName; - command += " -"; - AddOmninamesParams(command); - - SCRUTE( command ); - return command; -} - -//============================================================================= -/*! - * Gives a sublist of machines with matching OS. - * If parameter OS is empty, gives the complete list of machines - */ -//============================================================================= - -// Warning need an updated parsed list : _resourcesList -void -SALOME_ResourcesManager::SelectOnlyResourcesWithOS -( vector& hosts, - const char *OS) const -throw(SALOME_Exception) -{ - string base(OS); - - for (map::const_iterator iter = - _resourcesList.begin(); - iter != _resourcesList.end(); - iter++) - { - if ( (*iter).second.OS == base || base.size() == 0) - hosts.push_back((*iter).first); - } -} - - -//============================================================================= -/*! - * Gives a sublist of machines on which the module is known. 
- */ -//============================================================================= - -//Warning need an updated parsed list : _resourcesList -void -SALOME_ResourcesManager::KeepOnlyResourcesWithModule -( vector& hosts, - const Engines::CompoList& componentList) const -throw(SALOME_Exception) -{ - for (vector::iterator iter = hosts.begin(); iter != hosts.end();) - { - MapOfParserResourcesType::const_iterator it = _resourcesList.find(*iter); - const vector& mapOfModulesOfCurrentHost = (((*it).second).ModulesList); - - bool erasedHost = false; - if( mapOfModulesOfCurrentHost.size() > 0 ){ - for(int i=0;i::const_iterator itt = find(mapOfModulesOfCurrentHost.begin(), - mapOfModulesOfCurrentHost.end(), - compoi); -// componentList[i]); - if (itt == mapOfModulesOfCurrentHost.end()){ - erasedHost = true; - break; - } - } - } - if(erasedHost) - hosts.erase(iter); - else - iter++; - } -} - - -//============================================================================= -/*! - * add to command all options relative to naming service. 
- */ -//============================================================================= - -void SALOME_ResourcesManager::AddOmninamesParams(string& command) const - { - // If env variable OMNIORB_CONFIG is not defined or the file is more complex than one line - // does not work - // Even if we use it we have to check if env variable exists - //string omniORBcfg( getenv( "OMNIORB_CONFIG" ) ) ; - //ifstream omniORBfile( omniORBcfg.c_str() ) ; - //char ORBInitRef[11] ; - //char egal[3] ; - //char nameservice[132] ; - //omniORBfile >> ORBInitRef ; - //command += "ORBInitRef " ; - //omniORBfile >> egal ; - //omniORBfile >> nameservice ; - //omniORBfile.close() ; - //char * bsn = strchr( nameservice , '\n' ) ; - //if ( bsn ) { - //bsn[ 0 ] = '\0' ; - //} - //command += nameservice ; - - CORBA::String_var iorstr = _NS->getIORaddr(); - command += "ORBInitRef NameService="; - command += iorstr; - } - - -//============================================================================= -/*! - * add to command all options relative to naming service. - */ -//============================================================================= - -void SALOME_ResourcesManager::AddOmninamesParams(ofstream& fileStream) const - { - CORBA::String_var iorstr = _NS->getIORaddr(); - fileStream << "ORBInitRef NameService="; - fileStream << iorstr; - } - - -//============================================================================= -/*! 
- * generate a file name in /tmp directory - */ -//============================================================================= - -string SALOME_ResourcesManager::BuildTemporaryFileName() const - { - //build more complex file name to support multiple salome session - char *temp = new char[19]; - strcpy(temp, "/tmp/command"); - strcat(temp, "XXXXXX"); -#ifndef WNT - - mkstemp(temp); -#else - - char aPID[80]; - itoa(getpid(), aPID, 10); - strcat(temp, aPID); -#endif - - string command(temp); - delete [] temp; - command += ".sh"; - return command; - } - - -//============================================================================= -/*! - * Builds in a temporary file the script to be launched. - * - * Used if SALOME Application ($APPLI) is not defined. - * The command is build with data from CatalogResources, in which every path - * used on remote computer must be defined. - */ -//============================================================================= - -string -SALOME_ResourcesManager::BuildTempFileToLaunchRemoteContainer -(const string& machine, - const Engines::MachineParameters& params) throw(SALOME_Exception) -{ - int status; - - _TmpFileName = BuildTemporaryFileName(); - ofstream tempOutputFile; - tempOutputFile.open(_TmpFileName.c_str(), ofstream::out ); - const ParserResourcesType& resInfo = _resourcesList[machine]; - tempOutputFile << "#! /bin/sh" << endl; - - // --- set env vars - - tempOutputFile << "export SALOME_trace=local" << endl; // mkr : 27.11.2006 : PAL13967 - Distributed supervision graphs - Problem with "SALOME_trace" - //tempOutputFile << "source " << resInfo.PreReqFilePath << endl; - - // ! 
env vars - - if (params.isMPI) - { - tempOutputFile << "mpirun -np "; - int nbproc; - - if ( (params.nb_node <= 0) && (params.nb_proc_per_node <= 0) ) - nbproc = 1; - else if ( params.nb_node == 0 ) - nbproc = params.nb_proc_per_node; - else if ( params.nb_proc_per_node == 0 ) - nbproc = params.nb_node; - else - nbproc = params.nb_node * params.nb_proc_per_node; - - std::ostringstream o; - - tempOutputFile << nbproc << " "; -#ifdef WITHLAM - tempOutputFile << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace "; -#endif - } - - tempOutputFile << getenv("KERNEL_ROOT_DIR") << "/bin/salome/"; - - if (params.isMPI) - { - if (isPythonContainer(params.container_name)) - tempOutputFile << "pyMPI SALOME_ContainerPy.py "; - else - tempOutputFile << "SALOME_MPIContainer "; - } - - else - { - if (isPythonContainer(params.container_name)) - tempOutputFile << "SALOME_ContainerPy.py "; - else - tempOutputFile << "SALOME_Container "; - } - - tempOutputFile << _NS->ContainerName(params) << " -"; - AddOmninamesParams(tempOutputFile); - tempOutputFile << " &" << endl; - tempOutputFile.flush(); - tempOutputFile.close(); - chmod(_TmpFileName.c_str(), 0x1ED); - - // --- Build command - - string command; - - if (resInfo.Protocol == rsh) - { - command = "rsh "; - string commandRcp = "rcp "; - commandRcp += _TmpFileName; - commandRcp += " "; - commandRcp += machine; - commandRcp += ":"; - commandRcp += _TmpFileName; - status = system(commandRcp.c_str()); - } - - else if (resInfo.Protocol == ssh) - { - command = "ssh "; - string commandRcp = "scp "; - commandRcp += _TmpFileName; - commandRcp += " "; - commandRcp += machine; - commandRcp += ":"; - commandRcp += _TmpFileName; - status = system(commandRcp.c_str()); - } - else - throw SALOME_Exception("Unknown protocol"); - - if(status) - throw SALOME_Exception("Error of connection on remote host"); - - command += machine; - _CommandForRemAccess = command; - command += " "; - command += _TmpFileName; - - SCRUTE(command); - - return command; 
- -} - -//============================================================================= -/*! Creates a command line that the container manager uses to launch - * a parallel container. - */ -//============================================================================= -string -SALOME_ResourcesManager::BuildCommandToLaunchLocalParallelContainer(const std::string& exe_name, - const Engines::MachineParameters& params, - const std::string& log) -{ - // This method knows the differences between the proxy and the nodes. - // nb_component_nodes is not used in the same way if it is a proxy or - // a node. - - string command; - string parallelLib(CORBA::string_dup(params.parallelLib)); - string hostname(CORBA::string_dup(params.hostname)); - int par = exe_name.find("Proxy"); - int nbproc = params.nb_component_nodes; - char buffer [33]; - sprintf(buffer,"%d",nbproc); - - Engines::MachineParameters_var rtn = new Engines::MachineParameters(); - rtn->container_name = params.container_name; - rtn->hostname = params.hostname; - rtn->OS = params.OS; - rtn->mem_mb = params.mem_mb; - rtn->cpu_clock = params.cpu_clock; - rtn->nb_proc_per_node = params.nb_proc_per_node; - rtn->nb_node = params.nb_node; - rtn->isMPI = params.isMPI; - - string real_exe_name = exe_name + parallelLib; - - if (parallelLib == "Dummy") - { - //command = "gdb --args "; - //command = "valgrind --tool=memcheck --log-file=val_log "; - //command += real_exe_name; - - command = real_exe_name; - - command += " " + _NS->ContainerName(rtn); - command += " " + parallelLib; - command += " " + hostname; - command += " -"; - AddOmninamesParams(command); - } - - else if (parallelLib == "Mpi") - { - // Step 1 : check if MPI is started - if (_MpiStarted == false) - { - startMPI(); - } - - if (par < 0) - { - // Nodes case - - command = "mpiexec -np " + string(buffer) + " "; -// command += "gdb --args "; - command += real_exe_name; - command += " " + _NS->ContainerName(rtn); - command += " " + parallelLib; - command += " " + 
hostname; - command += " -"; - AddOmninamesParams(command); - } - else - { - // Proxy case - command = "mpiexec -np 1 "; - command += real_exe_name; - command += " " + _NS->ContainerName(rtn); - command += " " + string(buffer); - command += " " + parallelLib; - command += " " + hostname; - command += " -"; - AddOmninamesParams(command); - } - } - else - { - std::string message("Unknown parallelLib" + parallelLib); - throw SALOME_Exception(message.c_str()); - } - - // log choice - if (log == "default") - { - command += " > /tmp/"; - command += _NS->ContainerName(rtn); - command += "_"; - command += GetHostname(); - command += "_"; - command += getenv( "USER" ) ; - command += ".log 2>&1 &" ; - } - if (log == "xterm") - { - command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; " - + command + " \" &"; -// + command + "; echo $LD_LIBRARY_PATH; cat \" &"; - } - return command; - -/* if (log == "xterm") - { - command = "/usr/X11R6/bin/xterm -e \"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; export PATH=$PATH; echo $LD_LIBRARY_PATH; echo $PATH; " + command + "; cat \" &"; - } -*/ -/* command = "cd ; rm " + fichier_commande + "; touch " + \ - fichier_commande + "; echo \" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH; " + \ - command + " >& /tmp/ribes_" + fichier_commande + " & \" > " + fichier_commande + ";"; - command += "ssh cn01 sh " + fichier_commande + " &"; - cerr << "La commande : " << command << endl; -*/ -} - -void SALOME_ResourcesManager::startMPI() -{ - cerr << "----------------------------------------------" << endl; - cerr << "----------------------------------------------" << endl; - cerr << "----------------------------------------------" << endl; - cerr << "-Only Lam on Localhost is currently supported-" << endl; - cerr << "----------------------------------------------" << endl; - cerr << "----------------------------------------------" << endl; - cerr << "----------------------------------------------" << endl; - - int 
status = system("lamboot"); - if (status == -1) - { - INFOS("lamboot failed : system command status -1"); - } - else if (status == 217) - { - INFOS("lamboot failed : system command status 217"); - } - else - { - _MpiStarted = true; - } + return CORBA::string_dup(_rm.FindFirst(ml).c_str()); } Engines::MachineParameters* SALOME_ResourcesManager::GetMachineParameters(const char *hostname) { - ParserResourcesType resource; - if (_resourcesList.find(hostname) != _resourcesList.end()) - resource = _resourcesList[string(hostname)]; - else - resource = _resourcesBatchList[string(hostname)]; - + ParserResourcesType resource = _rm.GetResourcesList(string(hostname)); Engines::MachineParameters *p_ptr = new Engines::MachineParameters; p_ptr->container_name = CORBA::string_dup(""); p_ptr->hostname = CORBA::string_dup(resource.HostName.c_str()); @@ -1200,3 +227,4 @@ Engines::MachineParameters* SALOME_ResourcesManager::GetMachineParameters(const return p_ptr; } + diff --git a/src/ResourcesManager/SALOME_ResourcesManager.hxx b/src/ResourcesManager/SALOME_ResourcesManager.hxx index af258b807..f2fdc259b 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.hxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.hxx @@ -30,6 +30,7 @@ #include #include #include +#include "ResourcesManager.hxx" #if defined RESOURCESMANAGER_EXPORTS #if defined WIN32 @@ -66,99 +67,29 @@ class RESOURCESMANAGER_EXPORT SALOME_ResourcesManager: Engines::MachineList * GetFittingResources(const Engines::MachineParameters& params, const Engines::CompoList& componentList); -// throw(SALOME_Exception); char* FindFirst(const Engines::MachineList& listOfMachines); - std::string FindNext(const Engines::MachineList& listOfMachines); - std::string FindBest(const Engines::MachineList& listOfMachines); - std::string BuildCommandToLaunchRemoteContainer - (const std::string& machine, - const Engines::MachineParameters& params, const long id); - - std::string BuildCommandToLaunchLocalContainer - (const 
Engines::MachineParameters& params, const long id); - - void RmTmpFile(); - - std::string BuildCommand(const std::string& machine, - const char *containerName); - - int AddResourceInCatalog - (const Engines::MachineParameters& paramsOfNewResources, - const std::vector& modulesOnNewResources, - const char *alias, - const char *userName, - AccessModeType mode, - AccessProtocolType prot) - throw(SALOME_Exception); - - void DeleteResourceInCatalog(const char *hostname); - - void WriteInXmlFile(); - - const MapOfParserResourcesType& ParseXmlFile(); - - const MapOfParserResourcesType& GetList() const; - - // Parallel extension - std::string BuildCommandToLaunchLocalParallelContainer(const std::string& exe_name, - const Engines::MachineParameters& params, - const std::string& log = "default"); Engines::MachineParameters* GetMachineParameters(const char *hostname); void Shutdown(); static const char *_ResourcesManagerNameInNS; + ResourcesManager_cpp *GetImpl() { return &_rm; } + protected: - // Parallel extension - void startMPI(); - bool _MpiStarted; - SALOME_NamingService *_NS; CORBA::ORB_var _orb; PortableServer::POA_var _poa; - std::string BuildTempFileToLaunchRemoteContainer - (const std::string& machine, - const Engines::MachineParameters& params) throw(SALOME_Exception); - - void SelectOnlyResourcesWithOS(std::vector& hosts, - const char *OS) const - throw(SALOME_Exception); - - void KeepOnlyResourcesWithModule(std::vector& hosts, - const Engines::CompoList& componentList) const - throw(SALOME_Exception); - - void AddOmninamesParams(std::string& command) const; - - void AddOmninamesParams(std::ofstream& fileStream) const; - - std::string BuildTemporaryFileName() const; - - //! will contain the path to the ressources catalog - std::string _path_resources; - - //! attribute that contains current tmp files generated - std::string _TmpFileName; - - //! contains the rsh or ssh command to access directly to machine. 
- // Only used by this->RmTmpFile in case of a remote launch. - std::string _CommandForRemAccess; - - //! will contain the informations on the data type catalog(after parsing) - MapOfParserResourcesType _resourcesList; // //! will contain the informations on the data type catalog(after parsing) MapOfParserResourcesType _resourcesBatchList; - SALOME_LoadRateManager _dynamicResourcesSelecter; + ResourcesManager_cpp _rm; - //! different behaviour if $APPLI exists (SALOME Application) - bool _isAppliSalomeDefined; }; #endif // RESSOURCESCATALOG_IMPL_H -- 2.39.2