1 // Copyright (C) 2005 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
2 // CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License as published by the Free Software Foundation; either
7 // version 2.1 of the License.
9 // This library is distributed in the hope that it will be useful
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 // Lesser General Public License for more details.
14 // You should have received a copy of the GNU Lesser General Public
15 // License along with this library; if not, write to the Free Software
16 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
23 * Author : Bernard SECHER - CEA/DEN
29 #include "BatchLight_BatchManager_PBS.hxx"
30 #include "utilities.h"
31 #include "BatchLight_Job.hxx"
39 namespace BatchLight {
// Builds a PBS batch manager from the batch parameters `p`, which are passed
// on to the BatchManager base-class constructor.
// NOTE(review): the dynamic exception specification `throw(SALOME_Exception)`
// is deprecated since C++11 and is no longer accepted in C++17.
42 BatchManager_PBS::BatchManager_PBS(const batchParams& p) throw(SALOME_Exception) : BatchManager(p)
44 // The PBS batch system needs to know which MPI implementation is in use.
// The implementation object is obtained from FactoryMpiImpl using _params.mpiImpl.
45 _mpiImpl = FactoryMpiImpl(_params.mpiImpl);
// Destructor. Emits a trace message that includes _params.hostname.
// NOTE(review): confirm who owns and releases _mpiImpl (it is assigned in the
// constructor from FactoryMpiImpl).
49 BatchManager_PBS::~BatchManager_PBS()
51 MESSAGE("BatchManager_PBS destructor "<<_params.hostname);
54 // Job control method: removes a job from the batch manager.
//
// Builds a shell command that runs `qdel` on _params.hostname (through the
// protocol named in _params.protocol) and executes it with system().
//
// jobid : identifier of the job to delete.
// Throws SALOME_Exception("Unknown protocol") when the protocol is neither
// "rsh" nor "ssh", and SALOME_Exception("Error of connection on remote host")
// when the remote command is reported as failed.
55 void BatchManager_PBS::deleteJob(const int & jobid)
57 BEGIN_OF("BatchManager_PBS::deleteJob");
63 // define the command that deletes the batch job
// The remote-shell prefix depends on the configured protocol.
64 if( _params.protocol == "rsh" )
66 else if( _params.protocol == "ssh" )
69 throw SALOME_Exception("Unknown protocol");
// The user name is added only when one is configured; the host name always is.
71 if (_params.username != ""){
72 command += _params.username;
76 command += _params.hostname;
// Start of the quoted remote command line.
77 command += " \"qdel " ;
// NOTE(review): the command is assembled by plain string concatenation and run
// through the shell; user name and host name are not quoted or escaped —
// confirm they always come from trusted configuration.
80 SCRUTE(command.c_str());
81 status = system(command.c_str());
83 throw SALOME_Exception("Error of connection on remote host");
// NOTE(review): there is no space before "killed", so the trace reads
// "jobId = <id>killed".
85 MESSAGE("jobId = " << jobid << "killed");
86 END_OF("BatchManager_PBS::deleteJob");
89 // Job control method: returns the state of a job.
//
// Runs `qstat -f` on _params.hostname through rsh or ssh, then looks for the
// "job_state" line in a local log file to extract the job status.
//
// jobid : identifier of the job to query.
// Returns the job status as a string.
// Throws SALOME_Exception("Unknown protocol") when the protocol is neither
// "rsh" nor "ssh", and SALOME_Exception("Error of connection on remote host")
// when system() returns a non-zero status other than 153 / 256*153.
90 string BatchManager_PBS::queryJob(const int & jobid)
92 BEGIN_OF("BatchManager_PBS::queryJob");
93 // define name of log file
95 string logFile="/tmp/logs/";
// NOTE(review): getenv() returns a null pointer when USER is not set, and
// appending a null `const char*` to a std::string is undefined behavior.
96 logFile += getenv("USER");
97 logFile += "/batchSalome_";
103 logFile += oss.str();
109 // define the command that queries the batch job
// The remote-shell prefix depends on the configured protocol.
110 if( _params.protocol == "rsh" )
112 else if( _params.protocol == "ssh" )
115 throw SALOME_Exception("Unknown protocol");
// The user name is added only when one is configured; the host name always is.
117 if (_params.username != ""){
118 command += _params.username;
122 command += _params.hostname;
// Start of the quoted remote command line.
123 command += " \"qstat -f " ;
126 command += oss2.str();
129 SCRUTE(command.c_str());
130 status = system(command.c_str());
// 153 and 256*153 are not treated as errors here; 256*153 is presumably the
// same exit code 153 seen through the wait-status encoding returned by
// system() (exit code in the high byte) — TODO confirm.
131 if(status && status != 153 && status != 256*153){
132 MESSAGE("status="<<status);
133 throw SALOME_Exception("Error of connection on remote host");
136 if(status == 153 || status == 256*153 )
137 // When the job is finished, the qstat command returns status 153.
140 // read status of job in log file
142 ifstream fp(logFile.c_str(),ios::in);
// NOTE(review): string::npos has type string::size_type; storing it (and the
// result of find()) in an `int` narrows the value. string::size_type would be
// the matching type.
145 int pos = string::npos;
// Scan the log line by line until a line containing "job_state" is found.
// NOTE(review): getline() with a count of 80 sets failbit on lines of 80 or
// more characters, which ends this loop early — confirm log lines stay short.
146 while( (pos == string::npos) && fp.getline(line,80,'\n') ){
147 sline = string(line);
148 pos = sline.find("job_state");
// Parse the matching line only when "job_state" was actually found.
151 if(pos!=string::npos){
152 istringstream iss(sline);
161 MESSAGE("jobId = " << jobid << " " << jstatus);
162 END_OF("BatchManager_PBS::queryJob");
// Generates the shell script that launches SALOME on every MPI process and
// transfers it to the remote host.
//
// The script is written to a local temporary file, made executable, and then
// sent with a command built for the configured protocol; the destination path
// ends in "<_dirForTmpFiles>/runSalome_<_fileNameToExecute>_Batch.sh".
//
// fileToExecute : path of the Python file to run; its base name (without
//                 directory and extension) is stored in _fileNameToExecute.
// Throws SALOME_Exception("Unknown protocol") when the protocol is neither
// "rsh" nor "ssh", and SALOME_Exception("Error of connection on remote host")
// when the transfer command is reported as failed.
166 void BatchManager_PBS::buildSalomeCouplingScript( const char *fileToExecute ) throw(SALOME_Exception)
168 BEGIN_OF("BatchManager_PBS::buildSalomeCouplingScript");
// Keep what lies between the last '/' and the last '.' of the path.
171 string::size_type p1 = string(fileToExecute).find_last_of("/");
172 string::size_type p2 = string(fileToExecute).find_last_of(".");
173 _fileNameToExecute = string(fileToExecute).substr(p1+1,p2-p1-1);
175 _TmpFileName = BuildTemporaryFileName();
176 ofstream tempOutputFile;
// NOTE(review): the result of open() is not checked before writing.
177 tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
// NOTE(review): the generated script is declared as /bin/sh but uses the
// `for ((...))` arithmetic loop below, which is a bash extension — confirm
// that /bin/sh is bash on the target cluster.
178 tempOutputFile << "#! /bin/sh -f" << endl ;
// Script preamble: move to the application directory and extend PYTHONPATH
// with the remote temporary directory.
179 tempOutputFile << "cd " ;
180 tempOutputFile << _params.applipath << endl ;
181 tempOutputFile << "export PYTHONPATH=~/" ;
182 tempOutputFile << _dirForTmpFiles ;
183 tempOutputFile << ":$PYTHONPATH" << endl ;
// Branch taken by the process whose rank expression equals 0: it starts the
// application, waits for the containers, runs the Python file, then stops.
184 tempOutputFile << "if test " ;
185 tempOutputFile << _mpiImpl->rank() ;
186 tempOutputFile << " = 0; then" << endl ;
187 tempOutputFile << " ./runAppli --terminal --batch --modules=" ;
// Comma-separated list of modules, with no trailing comma.
// NOTE(review): `int i` is compared against the unsigned result of size().
188 for ( int i = 0 ; i < _params.modulesList.size() ; i++ ) {
189 tempOutputFile << _params.modulesList[i] ;
190 if ( i != _params.modulesList.size()-1 )
191 tempOutputFile << "," ;
193 tempOutputFile << " --standalone=registry,study,moduleCatalog --killall &" << endl ;
// Build the list of container names YACS_Server_1 .. YACS_Server_<size-1>.
194 tempOutputFile << " for ((ip=1; ip < ";
195 tempOutputFile << _mpiImpl->size();
196 tempOutputFile << " ; ip++))" << endl;
197 tempOutputFile << " do" << endl ;
198 tempOutputFile << " arglist=\"$arglist YACS_Server_\"$ip" << endl ;
199 tempOutputFile << " done" << endl ;
200 tempOutputFile << " sleep 5" << endl ;
201 tempOutputFile << " ./runSession waitContainers.py $arglist" << endl ;
202 tempOutputFile << " ./runSession python ~/" << _dirForTmpFiles << "/" << _fileNameToExecute << ".py" << endl;
203 tempOutputFile << " ./runSession killCurrentPort" << endl;
// Branch taken by every other process: wait for the naming service, then
// start a container named after the process rank.
204 tempOutputFile << "else" << endl ;
205 tempOutputFile << " sleep 5" << endl ;
206 tempOutputFile << " ./runSession waitNS.py" << endl ;
207 tempOutputFile << " ./runSession SALOME_Container 'YACS_Server_'";
208 tempOutputFile << _mpiImpl->rank() << endl ;
209 tempOutputFile << "fi" << endl ;
210 tempOutputFile.flush();
211 tempOutputFile.close();
// 0x1ED is octal 0755: rwx for the owner, r-x for group and others.
212 chmod(_TmpFileName.c_str(), 0x1ED);
213 SCRUTE(_TmpFileName.c_str()) ;
// Build the command that sends the temporary script to the remote host.
216 if( _params.protocol == "rsh" )
218 else if( _params.protocol == "ssh" )
221 throw SALOME_Exception("Unknown protocol");
// Source: the local temporary file.
223 command += _TmpFileName;
// Destination: optional user name, host name, then the remote script path.
225 if (_params.username != ""){
226 command += _params.username;
229 command += _params.hostname;
231 command += _dirForTmpFiles ;
232 command += "/runSalome_" ;
233 command += _fileNameToExecute ;
234 command += "_Batch.sh" ;
235 SCRUTE(_fileNameToExecute) ;
236 SCRUTE(command.c_str());
237 status = system(command.c_str());
239 throw SALOME_Exception("Error of connection on remote host");
242 END_OF("BatchManager_PBS::buildSalomeCouplingScript");
// Generates the PBS submission script and transfers it to the remote host.
//
// The script carries the "#PBS" directives (node count, output log) followed
// by the boot / run / halt commands supplied by the MPI implementation. It is
// written to a local temporary file, made executable, and sent with a command
// built for the configured protocol.
//
// nbproc : number of processors requested for the job.
// Throws SALOME_Exception("Too much processors asked for that cluster") when
// nbproc exceeds _params.nbnodes * _params.nbprocpernode,
// SALOME_Exception("Unknown protocol") when the protocol is neither "rsh" nor
// "ssh", and SALOME_Exception("Error of connection on remote host") when the
// transfer command is reported as failed.
245 void BatchManager_PBS::buildSalomeBatchScript( const int nbproc ) throw(SALOME_Exception)
247 BEGIN_OF("BatchManager_PBS::buildSalomeBatchScript");
// Reject requests larger than the cluster capacity.
250 int nbmaxproc = _params.nbnodes * _params.nbprocpernode;
251 if( nbproc > nbmaxproc ){
252 MESSAGE(nbproc << " processors asked on a cluster of " << nbmaxproc << " processors");
253 throw SALOME_Exception("Too much processors asked for that cluster");
// Choose the node count written to the "#PBS -l nodes=" directive.
257 if( nbproc < _params.nbnodes )
260 nbnodes = _params.nbnodes;
262 _TmpFileName = BuildTemporaryFileName();
263 ofstream tempOutputFile;
// NOTE(review): the result of open() is not checked before writing.
264 tempOutputFile.open(_TmpFileName.c_str(), ofstream::out );
// Remote path of the per-process launch script that the MPI run command is
// given to execute (note the leading space inside the literal).
266 ostringstream filenameToExecute;
267 filenameToExecute << " ~/" << _dirForTmpFiles << "/runSalome_" << _fileNameToExecute << "_Batch.sh";
269 tempOutputFile << "#! /bin/sh -f" << endl ;
270 tempOutputFile << "#PBS -l nodes=" << nbnodes << endl ;
271 tempOutputFile << "#PBS -o ~/" << _dirForTmpFiles << "/runSalome.log${PBS_JOBID}" << endl ;
// The MPI implementation supplies the text for starting, running and
// stopping MPI, using the PBS node file.
272 tempOutputFile << _mpiImpl->boot("${PBS_NODEFILE}",nbnodes);
273 tempOutputFile << _mpiImpl->run("${PBS_NODEFILE}",nbproc,filenameToExecute.str());
274 tempOutputFile << _mpiImpl->halt();
275 tempOutputFile.flush();
276 tempOutputFile.close();
// 0x1ED is octal 0755: rwx for the owner, r-x for group and others.
277 chmod(_TmpFileName.c_str(), 0x1ED);
278 SCRUTE(_TmpFileName.c_str()) ;
// Build the command that sends the temporary script to the remote host.
281 if( _params.protocol == "rsh" )
283 else if( _params.protocol == "ssh" )
286 throw SALOME_Exception("Unknown protocol");
// Source: the local temporary file.
287 command += _TmpFileName;
// Destination: optional user name, host name, then the remote script path.
289 if (_params.username != ""){
290 command += _params.username;
293 command += _params.hostname;
295 command += _dirForTmpFiles ;
297 command += _fileNameToExecute ;
298 command += "_Batch.sh" ;
299 SCRUTE(command.c_str());
300 status = system(command.c_str());
302 throw SALOME_Exception("Error of connection on remote host");
305 END_OF("BatchManager_PBS::buildSalomeBatchScript");
// Submits the previously generated batch script with `qsub` on the remote
// host and reads the resulting job identifier back from a local log file.
//
// Presumably returns the numeric id of the submitted job (the text before the
// first '.' in the first line of the log) — TODO confirm against the return
// statement.
// Throws SALOME_Exception("Unknown protocol") when the protocol is neither
// "rsh" nor "ssh", and SALOME_Exception("Error of connection on remote host")
// when the submission command is reported as failed.
309 int BatchManager_PBS::submit() throw(SALOME_Exception)
311 BEGIN_OF("BatchManager_PBS::submit");
313 // define name of log file
314 string logFile="/tmp/logs/";
// NOTE(review): getenv() returns a null pointer when USER is not set, and
// appending a null `const char*` to a std::string is undefined behavior.
315 logFile += getenv("USER");
316 logFile += "/batchSalome_";
// NOTE(review): the generator is reseeded from the current time on each call;
// two calls within the same second start from the same seed — confirm this
// cannot make two log file names collide.
318 srand ( time(NULL) );
322 logFile += oss.str();
328 // define command to submit batch
// The remote-shell prefix depends on the configured protocol.
329 if( _params.protocol == "rsh" )
331 else if( _params.protocol == "ssh" )
334 throw SALOME_Exception("Unknown protocol");
// The user name is added only when one is configured; the host name always is.
336 if (_params.username != ""){
337 command += _params.username;
341 command += _params.hostname;
// Quoted remote command: qsub on the script in the remote temporary
// directory, with the output redirected (the redirection sits outside the
// quotes).
342 command += " \"qsub " ;
343 command += _dirForTmpFiles ;
345 command += _fileNameToExecute ;
346 command += "_Batch.sh\" > ";
348 SCRUTE(command.c_str());
349 status = system(command.c_str());
351 throw SALOME_Exception("Error of connection on remote host");
353 // read id of submitted job in log file
// NOTE(review): fp is passed to fgets() without a null check; if the log file
// cannot be opened this dereferences a null FILE*.
355 FILE *fp = fopen(logFile.c_str(),"r");
356 fgets( line, 128, fp);
// The job id is the text that precedes the first '.' of the line.
// NOTE(review): find() returns string::size_type; storing it in an `int`
// narrows the value before the comparison with string::npos.
360 int pos = sline.find(".");
362 if(pos == string::npos)
365 strjob = sline.substr(0,pos);
// Convert the textual id to a number.
368 istringstream iss(strjob);
371 END_OF("BatchManager_PBS::submit");