From f688c5d43b7bd283c4a8e8ea759c30c87f155fa8 Mon Sep 17 00:00:00 2001 From: secher Date: Mon, 7 Dec 2009 15:23:10 +0000 Subject: [PATCH] LSF improvement and change copy files command --- src/Core/Batch_CommunicationProtocol.cxx | 4 +- src/Core/Batch_CommunicationProtocolSSH.cxx | 8 +- src/LSF/Batch_BatchManager_eLSF.cxx | 173 ++++++++++++-------- src/LSF/Batch_BatchManager_eLSF.hxx | 2 + 4 files changed, 116 insertions(+), 71 deletions(-) diff --git a/src/Core/Batch_CommunicationProtocol.cxx b/src/Core/Batch_CommunicationProtocol.cxx index ef804f7..530a5f3 100644 --- a/src/Core/Batch_CommunicationProtocol.cxx +++ b/src/Core/Batch_CommunicationProtocol.cxx @@ -127,9 +127,9 @@ namespace Batch { for (unsigned int i=0 ; i 0 ){ - string::size_type p1 = fileToExecute.find_last_of("/"); - string::size_type p2 = fileToExecute.find_last_of("."); - fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); - } - else - fileNameToExecute = "command"; + string::size_type p1 = fileToExecute.find_last_of("/"); + string::size_type p2 = fileToExecute.find_last_of("."); + std::string fileNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); // export input files on cluster + cerr << "Export des fichiers en entree" << endl; exportInputFiles(job); // build batch script for job + cerr << "Construction du script de batch" << endl; buildBatchScript(job); + cerr << "Script envoye" << endl; // define name of log file (local) string logFile = generateTemporaryFileName("LSF-submitlog"); // define command to submit batch - string subCommand = string("cd ") + dirForTmpFiles + "; bsub < " + - fileNameToExecute + "_Batch.sh"; + string subCommand = string("cd ") + workDir + "; bsub < " + fileNameToExecute + "_Batch.sh"; string command = _protocol.getExecCommand(subCommand, _hostname, _username); command += " > "; command += logFile; + command += " 2>&1"; cerr << command.c_str() << endl; status = system(command.c_str()); if(status) - throw EmulationException("Error of connection on remote host"); + { + ifstream error_message(logFile.c_str()); + std::string mess; + std::string temp; + while(std::getline(error_message, temp)) + mess += temp; + error_message.close(); + throw EmulationException("Error of connection on remote host, error was: " + mess); + } // read id of submitted job in log file char line[128]; @@ -205,33 +211,41 @@ namespace Batch { { #ifndef WIN32 //TODO: need for porting on Windows Parametre params = job.getParametre(); - Environnement env = job.getEnvironnement(); - const int nbproc = params[NBPROC]; - const long edt = params[MAXWALLTIME]; - const long mem = params[MAXRAMSIZE]; - const string workDir = params[WORKDIR]; - const std::string dirForTmpFiles = params[TMPDIR]; - const string fileToExecute = params[EXECUTABLE]; - const string home = params[HOMEDIR]; - const std::string queue = params[QUEUE]; - std::string rootNameToExecute; - std::string fileNameToExecute; - std::string filelogtemp; - if( fileToExecute.size() > 0 ){ - string::size_type p1 = fileToExecute.find_last_of("/"); - string::size_type p2 = fileToExecute.find_last_of("."); - rootNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); - char* basec=strdup(fileToExecute.c_str()); - fileNameToExecute = "~/" + dirForTmpFiles + "/" + string(basename(basec)); - free(basec); - - int idx = dirForTmpFiles.find("Batch/"); - filelogtemp = dirForTmpFiles.substr(idx+6, dirForTmpFiles.length()); - } - else{ - rootNameToExecute = "command"; - } + // Job Parameters + string workDir = ""; + string fileToExecute = ""; + int nbproc = 0; + int edt = 0; + int mem = 0; + string queue = ""; + + // Mandatory parameters + if (params.find(WORKDIR) != params.end()) + workDir = params[WORKDIR].str(); + else + throw EmulationException("params[WORKDIR] is not defined ! Please defined it, cannot submit this job"); + if (params.find(EXECUTABLE) != params.end()) + fileToExecute = params[EXECUTABLE].str(); + else + throw EmulationException("params[EXECUTABLE] is not defined ! Please defined it, cannot submit this job"); + + // Optional parameters + if (params.find(NBPROC) != params.end()) + nbproc = params[NBPROC]; + if (params.find(MAXWALLTIME) != params.end()) + edt = params[MAXWALLTIME]; + if (params.find(MAXRAMSIZE) != params.end()) + mem = params[MAXRAMSIZE]; + if (params.find(QUEUE) != params.end()) + queue = params[QUEUE].str(); + + string::size_type p1 = fileToExecute.find_last_of("/"); + string::size_type p2 = fileToExecute.find_last_of("."); + string rootNameToExecute = fileToExecute.substr(p1+1,p2-p1-1); + string fileNameToExecute = fileToExecute.substr(p1+1); + + // Create batch submit file ofstream tempOutputFile; std::string TmpFileName = createAndOpenTemporaryFile("LSF-script", tempOutputFile); @@ -243,43 +257,48 @@ namespace Batch { if( mem > 0 ) tempOutputFile << "#BSUB -M " << mem*1024 << endl ; tempOutputFile << "#BSUB -n " << nbproc << endl ; - if( fileToExecute.size() > 0 ){ - tempOutputFile << "#BSUB -o " << home << "/" << dirForTmpFiles << "/output.log." << filelogtemp << endl ; - tempOutputFile << "#BSUB -e " << home << "/" << dirForTmpFiles << "/error.log." << filelogtemp << endl ; - } - else{ - tempOutputFile << "#BSUB -o " << dirForTmpFiles << "/" << env["LOGFILE"] << ".output.log" << endl ; - tempOutputFile << "#BSUB -e " << dirForTmpFiles << "/" << env["LOGFILE"] << ".error.log" << endl ; - } - if( workDir.size() > 0 ) - tempOutputFile << "cd " << workDir << endl ; - if( fileToExecute.size() > 0 ){ - tempOutputFile << _mpiImpl->boot("",nbproc); - tempOutputFile << _mpiImpl->run("",nbproc,fileNameToExecute); - tempOutputFile << _mpiImpl->halt(); - } - else{ - tempOutputFile << "source " << env["SOURCEFILE"] << endl ; - tempOutputFile << env["COMMAND"]; - } - + size_t pos = workDir.find("$HOME"); + string baseDir; + if( pos != string::npos ) + baseDir = getHomeDir(workDir) + workDir.substr(pos+5,workDir.length()-5); + else + baseDir = workDir; + tempOutputFile << "#BSUB -o " << baseDir << "/logs/output.log." << rootNameToExecute << endl ; + tempOutputFile << "#BSUB -e " << baseDir << "/logs/error.log." << rootNameToExecute << endl ; + + tempOutputFile << "cd " << workDir << endl ; + + // generate nodes file + tempOutputFile << "bool=0" << endl; + tempOutputFile << "for i in $LSB_MCPU_HOSTS; do" << endl; + tempOutputFile << " if test $bool = 0; then" << endl; + tempOutputFile << " n=$i" << endl; + tempOutputFile << " bool=1" << endl; + tempOutputFile << " else" << endl; + tempOutputFile << " for ((j=0;j<$i;j++)); do" << endl; + tempOutputFile << " echo $n >> nodesFile" << endl; + tempOutputFile << " done" << endl; + tempOutputFile << " bool=0" << endl; + tempOutputFile << " fi" << endl; + tempOutputFile << "done" << endl; + + // Abstraction of PBS_NODEFILE - TODO + tempOutputFile << "export LIBBATCH_NODEFILE=nodesFile" << endl; + + // Launch the executable + tempOutputFile << "./" + fileNameToExecute << endl; tempOutputFile.flush(); tempOutputFile.close(); -#ifdef WIN32 - _chmod( -#else - chmod( -#endif - TmpFileName.c_str(), 0x1ED); - cerr << TmpFileName.c_str() << endl; + + BATCH_CHMOD(TmpFileName.c_str(), 0x1ED); + cerr << "Batch script file generated is: " << TmpFileName.c_str() << endl; int status = _protocol.copyFile(TmpFileName, "", "", - dirForTmpFiles + "/" + rootNameToExecute + "_Batch.sh", + workDir + "/" + rootNameToExecute + "_Batch.sh", _hostname, _username); if (status) throw EmulationException("Error of connection on remote host"); - remove(TmpFileName.c_str()); #endif } @@ -297,4 +316,24 @@ namespace Batch { return oss.str(); } + std::string BatchManager_eLSF::getHomeDir(std::string tmpdir) + { + std::string home; + int idx = tmpdir.find("Batch/"); + std::string filelogtemp = tmpdir.substr(idx+6, tmpdir.length()); + filelogtemp = "/tmp/logs" + filelogtemp + "_home"; + + string subCommand = string("echo $HOME"); + string command = _protocol.getExecCommand(subCommand, _hostname, _username) + " > " + filelogtemp; + cerr << command.c_str() << endl; + int status = system(command.c_str()); + if (status) + throw EmulationException("Error of launching home command on remote host"); + + std::ifstream file_home(filelogtemp.c_str()); + std::getline(file_home, home); + file_home.close(); + return home; + } + } diff --git a/src/LSF/Batch_BatchManager_eLSF.hxx b/src/LSF/Batch_BatchManager_eLSF.hxx index 09a7f1c..6d48a0e 100644 --- a/src/LSF/Batch_BatchManager_eLSF.hxx +++ b/src/LSF/Batch_BatchManager_eLSF.hxx @@ -72,6 +72,8 @@ namespace Batch { private: + std::string getHomeDir(std::string tmpdir); + #ifdef SWIG public: // Recupere le l'identifiant d'un job deja soumis au BatchManager -- 2.39.2