From 3087ca4927d3d948d98e3ce70cbc755af9b1e5bd Mon Sep 17 00:00:00 2001 From: barate Date: Mon, 30 Jan 2012 16:21:43 +0000 Subject: [PATCH] Get error messages from Slurm submission (fix bug PAL #2089) --- src/Core/Batch_Utils.cxx | 64 +++++++++++++++++++++++++ src/Core/Batch_Utils.hxx | 54 +++++++++++++++++++++ src/Core/CMakeLists.txt | 1 + src/Slurm/Batch_BatchManager_eSlurm.cxx | 59 ++++++++--------------- src/Slurm/Batch_JobInfo_eSlurm.cxx | 18 ++----- src/Slurm/Batch_JobInfo_eSlurm.hxx | 2 +- 6 files changed, 146 insertions(+), 52 deletions(-) create mode 100644 src/Core/Batch_Utils.cxx create mode 100644 src/Core/Batch_Utils.hxx diff --git a/src/Core/Batch_Utils.cxx b/src/Core/Batch_Utils.cxx new file mode 100644 index 0000000..3b706d2 --- /dev/null +++ b/src/Core/Batch_Utils.cxx @@ -0,0 +1,64 @@ +// Copyright (C) 2007-2011 CEA/DEN, EDF R&D, OPEN CASCADE +// +// Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// +/* + * Batch_Utils.cxx + * + * Created on: 30 jan. 
2012
+ * Author : Renaud BARATE - EDF R&D
+ */
+
+#include <cstdio>
+
+#include  // NOTE(review): header name missing here (apparently lost when the patch text was extracted) - restore it from the repository
+#include "Batch_Utils.hxx"
+
+#ifdef MSVC
+#define popen _popen
+#define pclose _pclose
+#endif
+
+using namespace std;
+namespace Batch {
+
+int Utils::getCommandOutput(const string & command, string & output)
+{
+  // Run "command" through popen() and collect what it writes on its standard output in "output".
+  // Return -1 if the command cannot be started, otherwise the status returned by pclose().
+  output.clear();
+  FILE * fp = popen(command.c_str(), "r");
+  if (fp == NULL) {
+    return -1;
+  }
+
+  // Read raw chunks and append them by length so that the output is kept byte for byte:
+  // fgets() followed by "output += buf" would lose whatever follows a NUL byte in a chunk.
+  char buf[1024];
+  size_t nbRead;
+  while ((nbRead = fread(buf, 1, sizeof(buf), fp)) > 0) {
+    output.append(buf, nbRead);
+  }
+
+  // Close the pipe and hand its status back unchanged
+  return pclose(fp);
+}
+
+}
diff --git a/src/Core/Batch_Utils.hxx b/src/Core/Batch_Utils.hxx
new file mode 100644
index 0000000..24f17d1
--- /dev/null
+++ b/src/Core/Batch_Utils.hxx
@@ -0,0 +1,54 @@
+// Copyright (C) 2007-2011 CEA/DEN, EDF R&D, OPEN CASCADE
+//
+// Copyright (C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN,
+// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
+//
+/*
+ * Batch_Utils.hxx
+ *
+ * Created on: 30 jan. 
2012 + * Author : Renaud BARATE - EDF R&D + */ + +#ifndef BATCH_UTILS_HXX_ +#define BATCH_UTILS_HXX_ + +#include + +namespace Batch { + +class Utils { +public: + + /** + * Runs "command" through the system shell and stores what it writes on its standard output in parameter "output" (emptied first). + * Returns -1 if the command could not be started, otherwise the status reported by pclose() for it, passed back unchanged. + */ + static int getCommandOutput(const std::string & command, std::string & output); + +private: + + // Private constructor: no instantiation possible as this class provides only static methods + Utils() { } + +}; + +} + +#endif /* BATCH_UTILS_HXX_ */ diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 1e4f5c2..7d13826 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -54,6 +54,7 @@ SET(CLASS_LIST Core/Batch_APIInternalFailureException Core/Batch_StringType Core/Batch_TypeMismatchException Core/Batch_Versatile + Core/Batch_Utils ) APPEND_CLASSES_TO_SRC_FILES(${CLASS_LIST}) diff --git a/src/Slurm/Batch_BatchManager_eSlurm.cxx b/src/Slurm/Batch_BatchManager_eSlurm.cxx index e677509..4cbe8c9 100644 --- a/src/Slurm/Batch_BatchManager_eSlurm.cxx +++ b/src/Slurm/Batch_BatchManager_eSlurm.cxx @@ -27,13 +27,12 @@ */ #include -#include #include #include #include +#include -#include "Batch_FactBatchManager_eSlurm.hxx" #include "Batch_BatchManager_eSlurm.hxx" #include "Batch_JobInfo_eSlurm.hxx" @@ -60,7 +59,6 @@ namespace Batch { // Method to submit a job to the batch manager const JobId BatchManager_eSlurm::submitJob(const Job & job) { - int status; Parametre params = job.getParametre(); const string workDir = params[WORKDIR]; @@ -70,38 +68,27 @@ namespace Batch { // build command file to submit the job and copy it on the server string cmdFile = buildCommandFile(job); - // define name of log file (local) - string logFile = generateTemporaryFileName("slurm-submitlog"); - // define command to submit batch string subCommand = string("cd ") + workDir + "; sbatch " + cmdFile; string command = _protocol.getExecCommand(subCommand, _hostname, _username); - command += " > 
"; - command += logFile; - cerr << command.c_str() << endl; - status = system(command.c_str()); - if (status) - { - ifstream error_message(logFile.c_str()); - string mess; - string temp; - while(getline(error_message, temp)) - mess += temp; - error_message.close(); - throw EmulationException("Error of connection on remote host, error was: " + mess); - } - - // read id of submitted job in log file - string jobref; - ifstream idfile(logFile.c_str()); - string line; - while (idfile && line.compare(0, 20, "Submitted batch job ") != 0) - getline(idfile, line); - idfile.close(); - if (line.compare(0, 20, "Submitted batch job ") == 0) - jobref = line.substr(20); - if (jobref.size() == 0) + command += " 2>&1"; + cout << command.c_str() << endl; + + // submit job + string output; + int status = Utils::getCommandOutput(command, output); + cout << output; + if (status != 0) throw EmulationException("Can't submit job, error was: " + output); + + // find id of submitted job in output + string search = "Submitted batch job "; + string::size_type pos = output.find(search); + if (pos == string::npos) throw EmulationException("Error in the submission of the job on the remote host"); + pos += search.size(); + string::size_type endl_pos = output.find('\n', pos); + string::size_type count = (endl_pos == string::npos)? 
string::npos : endl_pos - pos; + string jobref = output.substr(pos, count); JobId id(this, jobref); return id; @@ -240,21 +227,17 @@ namespace Batch { JobInfo BatchManager_eSlurm::queryJob(const JobId & jobid) { - // define name of log file (local) - string logFile = generateTemporaryFileName("slurm-querylog-" + jobid.getReference()); - // define command to query batch string subCommand = "squeue -o %t -j " + jobid.getReference(); string command = _protocol.getExecCommand(subCommand, _hostname, _username); - command += " > "; - command += logFile; cerr << command.c_str() << endl; - system(command.c_str()); + string output; + Utils::getCommandOutput(command, output); // We don't test the return code here because with jobs finished since a long time Slurm // returns an error and a message like "slurm_load_jobs error: Invalid job id specified". // So we consider that the job is finished when we get an error. - JobInfo_eSlurm jobinfo = JobInfo_eSlurm(jobid.getReference(), logFile); + JobInfo_eSlurm jobinfo = JobInfo_eSlurm(jobid.getReference(), output); return jobinfo; } diff --git a/src/Slurm/Batch_JobInfo_eSlurm.cxx b/src/Slurm/Batch_JobInfo_eSlurm.cxx index 815279e..ac53a4d 100644 --- a/src/Slurm/Batch_JobInfo_eSlurm.cxx +++ b/src/Slurm/Batch_JobInfo_eSlurm.cxx @@ -26,8 +26,6 @@ * Author : Renaud BARATE - EDF R&D */ -#include -#include #include #include @@ -39,22 +37,16 @@ using namespace std; namespace Batch { - JobInfo_eSlurm::JobInfo_eSlurm(const std::string & id, const std::string & logFile) + JobInfo_eSlurm::JobInfo_eSlurm(const std::string & id, const std::string & queryOutput) : JobInfo() { _param[ID] = id; - // read log file - ifstream log(logFile.c_str()); - string line; - - // status should be on the second line - for (int i=0 ; i<2 ; i++) - getline(log, line); - log.close(); + // read query output, status should be on the second line + istringstream iss(queryOutput); string status; - istringstream iss(line); - iss >> status; + for (int i=0 ; i<2 ; i++) + 
getline(iss, status); if (status.size() == 0) { // On some batch managers, the job is deleted as soon as it is finished, diff --git a/src/Slurm/Batch_JobInfo_eSlurm.hxx b/src/Slurm/Batch_JobInfo_eSlurm.hxx index c97ea97..6c1a732 100644 --- a/src/Slurm/Batch_JobInfo_eSlurm.hxx +++ b/src/Slurm/Batch_JobInfo_eSlurm.hxx @@ -39,7 +39,7 @@ namespace Batch { { public: - JobInfo_eSlurm(const std::string & id, const std::string & logFile); + JobInfo_eSlurm(const std::string & id, const std::string & queryOutput); virtual ~JobInfo_eSlurm(); }; -- 2.39.2