From: Renaud Barate Date: Wed, 10 Jul 2019 12:03:08 +0000 (+0200) Subject: Properly distinguish errors from finished jobs X-Git-Tag: V2_4_2~2 X-Git-Url: http://git.salome-platform.org/gitweb/?a=commitdiff_plain;h=f35335f16bf8a0af2478f313824b71499fca4e5d;p=tools%2Flibbatch.git Properly distinguish errors from finished jobs --- diff --git a/src/Slurm/BatchManager_Slurm.cxx b/src/Slurm/BatchManager_Slurm.cxx index 2829ca6..a415da4 100644 --- a/src/Slurm/BatchManager_Slurm.cxx +++ b/src/Slurm/BatchManager_Slurm.cxx @@ -216,17 +216,44 @@ namespace Batch { JobInfo BatchManager_Slurm::queryJob(const JobId & jobid) { - // define command to query batch - string subCommand = "squeue -o %t -j " + jobid.getReference(); + // First try to query the job with "squeue" command + string subCommand = "squeue -h -o %T -j " + jobid.getReference() + " 2>/dev/null"; string command = _protocol.getExecCommand(subCommand, _hostname, _username); LOG(command); string output; - Utils::getCommandOutput(command, output); - // We don't test the return code here because with jobs finished since a long time Slurm - // returns an error and a message like "slurm_load_jobs error: Invalid job id specified". - // So we consider that the job is finished when we get an error. + int status = Utils::getCommandOutput(command, output); + LOG("status: " << status << ", output: " << output); + bool found = false; + JobInfo jobinfo; + if (status == 0) { + try { + jobinfo = JobInfo_Slurm(jobid.getReference(), output); + found = true; + } catch (const RunTimeException & exc) { + LOG(exc); + } + } - JobInfo_Slurm jobinfo = JobInfo_Slurm(jobid.getReference(), output); + // If "squeue" failed, the job may be finished. In this case, try to query the job with + // "sacct". + if (not found) { + string subCommand = "sacct -X -o State%-10 -n -j " + jobid.getReference(); + string command = _protocol.getExecCommand(subCommand, _hostname, _username); + LOG(command); + string output; + int status = Utils::getCommandOutput(command, output); + LOG("status: " << status << ", output: " << output); + if (status == 0) { + try { + jobinfo = JobInfo_Slurm(jobid.getReference(), output); + } catch (const RunTimeException & exc) { + LOG(exc); + throw(exc); + } + } else { + throw RunTimeException("sacct command failed with return code: " + status); + } + } return jobinfo; } diff --git a/src/Slurm/JobInfo_Slurm.cxx b/src/Slurm/JobInfo_Slurm.cxx index 9d398c0..ad6cb0c 100644 --- a/src/Slurm/JobInfo_Slurm.cxx +++ b/src/Slurm/JobInfo_Slurm.cxx @@ -35,6 +35,16 @@ using namespace std; +// Utility function to test if string str1 starts with str2 +bool starts_with(const std::string & str1, const std::string & str2) +{ + if (str1.length() < str2.length()) { + return false; + } + return (str1.substr(0, str2.length()) == str2); +} + + namespace Batch { JobInfo_Slurm::JobInfo_Slurm(const std::string & id, const std::string & queryOutput) @@ -42,39 +52,54 @@ namespace Batch { { _param[ID] = id; - // read query output, status should be on the second line - istringstream iss(queryOutput); - string status; - for (int i=0 ; i<2 ; i++) - getline(iss, status); - - if (status.size() == 0) { - // On some batch managers, the job is deleted as soon as it is finished, - // so we have to consider that an unknown job is a finished one, even if - // it is not always true. - _param[STATE] = FINISHED; - } else if (status == "CA") { // Canceled + // We test only the beginning of the string because some extra info can be added by sacct + // command (e.g. CANCELLED+) + if (starts_with(queryOutput, "BOOT_FAIL")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "CANCELLED")) { _param[STATE] = FAILED; - } else if (status == "CD") { // Completed + } else if (starts_with(queryOutput, "COMPLETED")) { _param[STATE] = FINISHED; - } else if (status == "CF") { // Configuring - _param[STATE] = QUEUED; - } else if (status == "CG") { // Completing + } else if (starts_with(queryOutput, "CONFIGURI")) { + _param[STATE] = RUNNING; + } else if (starts_with(queryOutput, "COMPLETIN")) { _param[STATE] = RUNNING; - } else if (status == "F") { // Failed + } else if (starts_with(queryOutput, "DEADLINE")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "FAILED")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "NODE_FAIL")) { _param[STATE] = FAILED; - } else if (status == "NF") { // Node Fail + } else if (starts_with(queryOutput, "OUT_OF_ME")) { _param[STATE] = FAILED; - } else if (status == "PD") { // Pending + } else if (starts_with(queryOutput, "PENDING")) { _param[STATE] = QUEUED; - } else if (status == "R") { // Running + } else if (starts_with(queryOutput, "PREEMPTED")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "RUNNING")) { _param[STATE] = RUNNING; - } else if (status == "S") { // Suspended + } else if (starts_with(queryOutput, "RESV_DEL_")) { + _param[STATE] = PAUSED; + } else if (starts_with(queryOutput, "REQUEUE")) { + _param[STATE] = PAUSED; + } else if (starts_with(queryOutput, "RESIZING")) { + _param[STATE] = PAUSED; + } else if (starts_with(queryOutput, "REVOKED")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "SIGNALING")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "SPECIAL_E")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "STAGE_OUT")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "STOPPED")) { + _param[STATE] = FAILED; + } else if (starts_with(queryOutput, "SUSPENDED")) { _param[STATE] = PAUSED; - } else if (status == "TO") { // Timeout + } else if (starts_with(queryOutput, "TIMEOUT")) { _param[STATE] = FAILED; } else { - throw RunTimeException("Unknown job state code: \"" + status + "\""); + throw RunTimeException("Unknown job state: \"" + queryOutput + "\""); } }