1 // Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
2 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License as published by the Free Software Foundation; either
7 // version 2.1 of the License.
9 // This library is distributed in the hope that it will be useful
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 // Lesser General Public License for more details.
14 // You should have received a copy of the GNU Lesser General Public
15 // License along with this library; if not, write to the Free Software
16 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
23 * Auteur : Bernard SECHER - CEA/DEN
29 #include "BatchLight_BatchManager_SLURM.hxx"
30 #include "utilities.h"
31 #include "BatchLight_Job.hxx"
39 namespace BatchLight {
// Constructs a SLURM batch manager, forwarding the batch parameters `p` to the
// BatchManager base-class constructor. Declared as possibly throwing
// SALOME_Exception.
// NOTE(review): the constructor body is not visible in this excerpt.
// NOTE(review): dynamic exception specifications (`throw(T)`) were deprecated
// in C++11 and removed in C++17, so this signature only builds with older
// language standards.
42 BatchManager_SLURM::BatchManager_SLURM(const batchParams& p) throw(SALOME_Exception) : BatchManager(p)
// Destructor. The only visible statement passes a message containing
// _params.hostname to the MESSAGE macro (presumably a trace/log macro from
// utilities.h -- confirm). No explicit resource release is visible here.
47 BatchManager_SLURM::~BatchManager_SLURM()
49 MESSAGE("BatchManager_SLURM destructor "<<_params.hostname);
52 // Job control method: removes a job from the manager.
//
// Builds a remote command line from _params (protocol "rsh" or "ssh",
// optional username, hostname) followed by a quoted "bkill ..." command, and
// runs it locally with system().
//
// jobid: identifier of the job to kill (also echoed in the final trace message).
// Throws SALOME_Exception("Unknown protocol") when the protocol is neither
// "rsh" nor "ssh", and SALOME_Exception("Error of connection on remote host")
// after the system() call (presumably when `status` signals failure -- the
// condition itself is not visible in this excerpt).
//
// NOTE(review): braces, the declarations of `command`/`status`, the rsh/ssh
// prefixes and the job-id formatting lines are missing from this excerpt.
// NOTE(review): "bkill" looks like the LSF kill command; a SLURM manager would
// normally call "scancel" -- confirm this is intentional.
// NOTE(review): the command is assembled by plain string concatenation and
// handed to a shell; username/hostname are not escaped.
53 void BatchManager_SLURM::deleteJob(const int & jobid)
55 BEGIN_OF("BatchManager_SLURM::deleteJob");
61 // define command to kill the batch job
62 if( _params.protocol == "rsh" )
64 else if( _params.protocol == "ssh" )
67 throw SALOME_Exception("Unknown protocol");
// a non-empty username is added to the command before the hostname
69 if (_params.username != ""){
70 command += _params.username;
74 command += _params.hostname;
75 command += " \"bkill " ;
78 SCRUTE(command.c_str());
79 status = system(command.c_str());
81 throw SALOME_Exception("Error of connection on remote host");
// NOTE(review): there is no space before "killed", so the text reads e.g.
// "jobId = 42killed".
83 MESSAGE("jobId = " << jobid << "killed");
84 END_OF("BatchManager_SLURM::deleteJob");
87 // Job control method: returns the state of the job.
//
// Composes a log-file path under /tmp/logs/<USER>/batchSalome_<suffix>, builds
// a remote "bjobs ..." command from _params (protocol, optional username,
// hostname), runs it with system(), then reads the first line (up to 80
// characters) of the log file. The visible code declares sjobid, username and
// jstatus and traces jstatus; the parsing and the return statement are not
// visible in this excerpt.
//
// jobid: identifier of the job to query.
// Throws SALOME_Exception("Unknown protocol") for a protocol other than
// "rsh"/"ssh", and SALOME_Exception("Error of connection on remote host")
// after the system() call (condition not visible -- presumably a failing status).
//
// NOTE(review): "bjobs" looks like the LSF status command; SLURM's equivalent
// would be "squeue" -- confirm this is intentional.
88 string BatchManager_SLURM::queryJob(const int & jobid)
90 BEGIN_OF("BatchManager_SLURM::queryJob");
91 // define name of log file
92 string logFile="/tmp/logs/";
// NOTE(review): getenv() returns a null pointer when USER is unset, and
// appending a null const char* to a std::string is undefined behaviour.
93 logFile += getenv("USER");
94 logFile += "/batchSalome_";
// `oss` is declared on a line missing from this excerpt; its content is the
// file-name suffix.
100 logFile += oss.str();
106 // define command to query the batch job
107 if( _params.protocol == "rsh" )
109 else if( _params.protocol == "ssh" )
112 throw SALOME_Exception("Unknown protocol");
114 if (_params.username != ""){
115 command += _params.username;
119 command += _params.hostname;
120 command += " \"bjobs " ;
// `oss2` is declared on a missing line -- presumably it holds the job id.
123 command += oss2.str();
126 SCRUTE(command.c_str());
127 status = system(command.c_str());
129 throw SALOME_Exception("Error of connection on remote host");
131 // read status of job in log file
// NOTE(review): no check that the stream opened successfully is visible here.
133 ifstream fp(logFile.c_str(),ios::in);
134 fp.getline(line,80,'\n');
136 string sjobid, username, jstatus;
141 MESSAGE("jobId = " << jobid << " " << jstatus);
142 END_OF("BatchManager_SLURM::queryJob");
// Generates the per-process launch script for a SALOME run and copies it to
// the remote host.
//
// Steps visible in this excerpt:
//   1. Derive _fileNameToExecute from fileToExecute: the text between the last
//      '/' and the last '.'.
//   2. Write a shell script to a temporary file (_TmpFileName). The script
//      changes to _params.applipath, prepends ~/<_dirForTmpFiles> to
//      PYTHONPATH, and branches on $SLURM_PROCID: process 0 starts runAppli
//      with the configured modules, waits for containers YACS_Server_1 ..
//      YACS_Server_<SLURM_NPROCS-1>, runs the Python file and then calls
//      killCurrentPort; every other process waits for the naming service and
//      starts a SALOME_Container named YACS_Server_<SLURM_PROCID>.
//   3. Make the script executable and transfer it with a command built from
//      _params (protocol, optional username, hostname), targeting
//      <_dirForTmpFiles>/runSalome_<_fileNameToExecute>_Batch.sh.
//
// fileToExecute: path of the file to run; only its base name without
//                extension is used, and the script runs "<base name>.py".
// Throws SALOME_Exception("Unknown protocol") for a protocol other than
// "rsh"/"ssh", and SALOME_Exception("Error of connection on remote host")
// after the system() call (condition not visible -- presumably a failing status).
//
// NOTE(review): braces, the declarations of `command`/`status` and the copy
// command prefixes are missing from this excerpt.
146 void BatchManager_SLURM::buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception)
148 BEGIN_OF("BatchManager_SLURM::buildSalomeCouplingScript");
// NOTE(review): find_last_of returns string::npos when the character is
// absent. With no '/', p1+1 wraps to 0, which still yields the start of the
// string. With no '.', the length p2-p1-1 is huge and substr runs to the end.
// A '.' located before the last '/' makes the length wrap around as well.
151 string::size_type p1 = string(fileToExecute).find_last_of("/");
152 string::size_type p2 = string(fileToExecute).find_last_of(".");
153 _fileNameToExecute = string(fileToExecute).substr(p1+1,p2-p1-1);
155 _TmpFileName = BuildTemporaryFileName();
156 ofstream tempOutputFile;
157 tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
// NOTE(review): the shebang requests /bin/sh, but the `for ((...))` loop
// emitted below is bash syntax -- confirm /bin/sh is bash on the target hosts.
158 tempOutputFile << "#! /bin/sh -f" << endl ;
159 tempOutputFile << "cd " ;
160 tempOutputFile << _params.applipath << endl ;
161 tempOutputFile << "export PYTHONPATH=~/" ;
162 tempOutputFile << _dirForTmpFiles ;
163 tempOutputFile << ":$PYTHONPATH" << endl ;
164 tempOutputFile << "if test $SLURM_PROCID = 0; then" << endl ;
165 tempOutputFile << " ./runAppli --terminal --batch --modules=" ;
// emit the module names as a comma-separated list (no trailing comma)
// NOTE(review): `i` is a signed int compared against an unsigned size().
166 for ( int i = 0 ; i < _params.modulesList.size() ; i++ ) {
167 tempOutputFile << _params.modulesList[i] ;
168 if ( i != _params.modulesList.size()-1 )
169 tempOutputFile << "," ;
171 tempOutputFile << " --standalone=registry,study,moduleCatalog --killall &" << endl ;
172 tempOutputFile << " for ((ip=1; ip < ${SLURM_NPROCS} ; ip++))" << endl;
173 tempOutputFile << " do" << endl ;
174 tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ;
175 tempOutputFile << " done" << endl ;
176 tempOutputFile << " sleep 5" << endl ;
177 tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ;
178 tempOutputFile << " ./runSession python ~/" << _dirForTmpFiles << "/" << _fileNameToExecute << ".py" << endl;
179 tempOutputFile << " ./runSession killCurrentPort" << endl;
180 tempOutputFile << "else" << endl ;
181 tempOutputFile << " sleep 5" << endl ;
182 tempOutputFile << " ./runSession waitNS.py" << endl ;
183 tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'${SLURM_PROCID}" << endl ;
184 tempOutputFile << "fi" << endl ;
185 tempOutputFile.flush();
186 tempOutputFile.close();
// 0x1ED is octal 0755: rwxr-xr-x
187 chmod(_TmpFileName.c_str(), 0x1ED);
188 SCRUTE(_TmpFileName.c_str()) ;
// build the command that transfers the temporary script to the remote host
191 if( _params.protocol == "rsh" )
193 else if( _params.protocol == "ssh" )
196 throw SALOME_Exception("Unknown protocol");
198 command += _TmpFileName;
200 if (_params.username != ""){
201 command += _params.username;
204 command += _params.hostname;
206 command += _dirForTmpFiles ;
207 command += "/runSalome_" ;
208 command += _fileNameToExecute ;
209 command += "_Batch.sh" ;
210 SCRUTE(command.c_str());
211 status = system(command.c_str());
213 throw SALOME_Exception("Error of connection on remote host");
216 END_OF("BatchManager_SLURM::buildSalomeCouplingScript");
// Generates the batch submission script and copies it to the remote host.
//
// The script written to a temporary file (_TmpFileName) contains two "#BSUB"
// directives (process count and log file ~/<_dirForTmpFiles>/runSalome.log%J)
// and one "mpirun -srun" line that launches
// ~/<_dirForTmpFiles>/runSalome_<_fileNameToExecute>_Batch.sh. The file is made
// executable and transferred with a command built from _params (protocol,
// optional username, hostname).
//
// nbproc: number of processes written after "#BSUB -n".
// Throws SALOME_Exception("Unknown protocol") for a protocol other than
// "rsh"/"ssh", and SALOME_Exception("Error of connection on remote host")
// after the system() call (condition not visible -- presumably a failing status).
//
// NOTE(review): this function reads _fileNameToExecute but does not assign it.
// Among the visible code it is set only in buildSalomeCouplingScript, so that
// function presumably has to run first -- confirm.
// NOTE(review): "#BSUB" directives look like LSF syntax; SLURM reads "#SBATCH"
// directives -- confirm this is intentional.
// NOTE(review): braces, declarations and part of the destination path (the
// line between _dirForTmpFiles and _fileNameToExecute) are missing from this
// excerpt.
219 void BatchManager_SLURM::buildSalomeBatchScript( const int nbproc ) throw(SALOME_Exception)
221 BEGIN_OF("BatchManager_SLURM::buildSalomeBatchScript");
223 _TmpFileName = BuildTemporaryFileName();
224 ofstream tempOutputFile;
225 tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
227 tempOutputFile << "#! /bin/sh -f" << endl ;
228 tempOutputFile << "#BSUB -n " << nbproc << endl ;
229 tempOutputFile << "#BSUB -o ~/" << _dirForTmpFiles << "/runSalome.log%J" << endl ;
230 tempOutputFile << "mpirun -srun ~/" << _dirForTmpFiles << "/runSalome_" << _fileNameToExecute << "_Batch.sh" << endl ;
231 tempOutputFile.flush();
232 tempOutputFile.close();
// 0x1ED is octal 0755: rwxr-xr-x
233 chmod(_TmpFileName.c_str(), 0x1ED);
234 SCRUTE(_TmpFileName.c_str()) ;
// build the command that transfers the temporary script to the remote host
237 if( _params.protocol == "rsh" )
239 else if( _params.protocol == "ssh" )
242 throw SALOME_Exception("Unknown protocol");
243 command += _TmpFileName;
245 if (_params.username != ""){
246 command += _params.username;
249 command += _params.hostname;
251 command += _dirForTmpFiles ;
253 command += _fileNameToExecute ;
254 command += "_Batch.sh" ;
255 SCRUTE(command.c_str());
256 status = system(command.c_str());
258 throw SALOME_Exception("Error of connection on remote host");
261 END_OF("BatchManager_SLURM::buildSalomeBatchScript");
// Submits the previously generated batch script on the remote host and
// extracts the job id from the submission output.
//
// Visible steps: compose a log-file path under
// /tmp/logs/<USER>/batchSalome_<suffix>; build a remote command from _params
// (protocol, optional username, hostname) that runs
// "bsub < <_dirForTmpFiles>...<_fileNameToExecute>_Batch.sh" with its output
// redirected (presumably into the log file -- the target is on a missing
// line); run it with system(); read the first line of the log file and take
// the text between '<' and '>' as the job id.
//
// Returns an int (presumably the parsed job id -- the return statement is not
// visible in this excerpt).
// Throws SALOME_Exception("Unknown protocol") for a protocol other than
// "rsh"/"ssh", and SALOME_Exception("Error of connection on remote host")
// after the system() call (condition not visible -- presumably a failing status).
//
// NOTE(review): "bsub" looks like the LSF submission command; SLURM's
// equivalent would be "sbatch" -- confirm this is intentional.
// NOTE(review): the end of this function lies beyond the visible excerpt.
265 int BatchManager_SLURM::submit() throw(SALOME_Exception)
267 BEGIN_OF("BatchManager_SLURM::submit");
269 // define name of log file
270 string logFile="/tmp/logs/";
// NOTE(review): getenv() returns a null pointer when USER is unset, and
// appending a null const char* to a std::string is undefined behaviour.
271 logFile += getenv("USER");
272 logFile += "/batchSalome_";
// Seeds rand(); presumably the file-name suffix in `oss` is a random number
// generated on lines missing from this excerpt.
274 srand ( time(NULL) );
278 logFile += oss.str();
284 // define command to submit batch
285 if( _params.protocol == "rsh" )
287 else if( _params.protocol == "ssh" )
290 throw SALOME_Exception("Unknown protocol");
292 if (_params.username != ""){
293 command += _params.username;
297 command += _params.hostname;
298 command += " \"bsub < " ;
299 command += _dirForTmpFiles ;
301 command += _fileNameToExecute ;
302 command += "_Batch.sh\" > ";
304 SCRUTE(command.c_str());
305 status = system(command.c_str());
307 throw SALOME_Exception("Error of connection on remote host");
309 // read id of submitted job in log file
// NOTE(review): neither the fopen() result nor the fgets() result is checked
// in the visible lines, and no fclose() is visible.
311 FILE *fp = fopen(logFile.c_str(),"r");
312 fgets( line, 128, fp);
// The job id is the text enclosed in angle brackets on that line.
// NOTE(review): find() returns string::npos when the character is absent;
// storing it in an int hides that case.
316 int p1 = sline.find("<");
317 int p2 = sline.find(">");
318 string strjob = sline.substr(p1+1,p2-p1-1);
321 istringstream iss(strjob);
324 END_OF("BatchManager_SLURM::submit");