From 5833f1dbea22498995de578a45c54c4359810ddc Mon Sep 17 00:00:00 2001 From: ribes Date: Thu, 28 Feb 2008 15:40:51 +0000 Subject: [PATCH] - Adding a test for batch machine before launching a job --- src/Launcher/BatchLight_BatchTest.cxx | 687 ++++++++++++++++++++++++++ src/Launcher/BatchLight_BatchTest.hxx | 60 +++ src/Launcher/Makefile.am | 26 +- src/Launcher/SALOME_Launcher.cxx | 9 +- src/Launcher/SALOME_Launcher.hxx | 1 + 5 files changed, 769 insertions(+), 14 deletions(-) create mode 100644 src/Launcher/BatchLight_BatchTest.cxx create mode 100644 src/Launcher/BatchLight_BatchTest.hxx diff --git a/src/Launcher/BatchLight_BatchTest.cxx b/src/Launcher/BatchLight_BatchTest.cxx new file mode 100644 index 000000000..954d33805 --- /dev/null +++ b/src/Launcher/BatchLight_BatchTest.cxx @@ -0,0 +1,687 @@ +#include "BatchLight_BatchTest.hxx" + +BatchLight_BatchTest::BatchLight_BatchTest(const Engines::MachineParameters& batch_descr) +{ + _batch_descr = batch_descr; + + // Getting date + Batch::Date date = Batch::Date(time(0)); + _date = date.str(); + int lend = _date.size() ; + int i = 0 ; + while (i < lend) + { + if (_date[i] == '/' || _date[i] == '-' || _date[i] == ':' ) + { + _date[i] = '_' ; + } + i++ ; + } + + // Creating test temporary file + _test_filename = "/tmp/"; + _test_filename += _date + "_test_cluster_file_"; + _test_filename += _batch_descr.alias.in(); + _base_filename = _date + "_test_cluster_file_" + _batch_descr.alias.in(); +} + +BatchLight_BatchTest::~BatchLight_BatchTest() {} + +bool +BatchLight_BatchTest::test() +{ + bool rtn = false; + INFOS(std::endl + << "--- Testing batch Machine :" << std::endl + << "--- Name : " << _batch_descr.hostname << std::endl + << "--- Alias : " << _batch_descr.alias << std::endl + << "--- Protocol : " << _batch_descr.protocol << std::endl + << "--- User Name : " << _batch_descr.username << std::endl + << "--- Batch Type : " << _batch_descr.batch << std::endl + << "--- MPI Impl : " << _batch_descr.mpiImpl << std::endl + 
<< "--- Appli Path : " << _batch_descr.applipath << std::endl + ); + + std::string result_connection("Not Tested"); + std::string result_filecopy("Not Tested"); + std::string result_getresult("Not Tested"); + std::string result_jobsubmit_simple("Not Tested"); + std::string result_jobsubmit_mpi("Not Tested"); + std::string result_appli("Not Tested"); + + result_connection = test_connection(); + result_filecopy = test_filecopy(); + result_getresult = test_getresult(); + result_jobsubmit_simple = test_jobsubmit_simple(); + result_jobsubmit_mpi = test_jobsubmit_mpi(); + result_appli = test_appli(); + + INFOS(std::endl + << "--- Test results" << std::endl + << "--- Connection : " << result_connection << std::endl + << "--- File copy : " << result_filecopy << std::endl + << "--- Get results : " << result_getresult << std::endl + << "--- Submit simple job : " << result_jobsubmit_simple << std::endl + << "--- Submit mpi job : " << result_jobsubmit_mpi << std::endl + << "--- Application : " << result_appli << std::endl + ); + + if (result_connection == "OK" and + result_filecopy == "OK" and + result_getresult == "OK" and + result_jobsubmit_simple == "OK" and + result_jobsubmit_mpi == "OK" and + result_appli == "OK") + rtn = true; + + return rtn; +} + +// For this test we use : alias, protocol, username +std::string +BatchLight_BatchTest::test_connection() +{ + int status; + std::string command; + std::string result("Failed : "); + std::string alias = _batch_descr.alias.in(); + std::string username = _batch_descr.username.in(); + std::string protocol = _batch_descr.protocol.in(); + + // Basic tests + if(alias == "") + { + result += "alias is empty !"; + return result; + } + if(username == "") + { + result += "username is empty !"; + return result; + } + if( protocol != "rsh" and protocol != "ssh") + { + result += "protocol unknown ! 
(" + protocol + ")"; + return result; + } + + // Build command + command += protocol + + " " + + username + "@" + alias; + + // Test + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error of connection on remote host ! status = "; + result += oss.str(); + return result; + } + + result = "OK"; + return result; +} + +// For this test we use : alias, protocol, username +std::string +BatchLight_BatchTest::test_filecopy() +{ + int status; + std::string home; + std::string command; + std::string result("Failed : "); + std::string alias = _batch_descr.alias.in(); + std::string username = _batch_descr.username.in(); + std::string protocol = _batch_descr.protocol.in(); + + // Getting home directory + std::string rst = get_home(&home); + if(rst != "") { + result += rst; + return result; + } + + // Writing into the tempory file + command = "echo Hello > " + _test_filename; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in creating tempory file ! status = "; + result += oss.str(); + return result; + } + + // Build command + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + _test_filename + " " + + username + "@" + alias + ":" + home; + + // Test + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in copy file on remote host ! 
status = "; + result += oss.str(); + return result; + } + + result = "OK"; + return result; +} + +// For this test we use : alias, protocol, username +std::string +BatchLight_BatchTest::test_getresult() +{ + int status; + std::string home; + std::string command; + std::string result("Failed : "); + std::string alias = _batch_descr.alias.in(); + std::string username = _batch_descr.username.in(); + std::string protocol = _batch_descr.protocol.in(); + + // Getting home directory + std::string rst = get_home(&home); + if(rst != "") { + result += rst; + return result; + } + + // Build command + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + username + "@" + alias + ":" + home + + "/" + _base_filename + " " + _test_filename + "_copy"; + + // Test + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in copy file from remote host ! status = "; + result += oss.str(); + return result; + } + + // Compare files + std::ifstream src_file(_test_filename.c_str()); + if (!src_file) + { + result += "Error in reading temporary file ! filename = " + _test_filename; + return result; + } + std::string cp_filename = _test_filename + "_copy"; + std::ifstream cp_file(cp_filename.c_str()); + if (!cp_file) + { + result += "Error in reading temporary copy file ! filename = " + cp_filename; + return result; + } + std::string src_firstline; + std::string cp_firstline; + std::getline(src_file, src_firstline); + std::getline(cp_file, cp_firstline); + src_file.close(); + cp_file.close(); + if (src_firstline != cp_firstline) + { + result += "Error source file and copy file are not equa ! 
source = " + src_firstline + " copy = " + cp_firstline; + return result; + } + + result = "OK"; + return result; +} + +std::string +BatchLight_BatchTest::test_jobsubmit_simple() +{ + int status; + std::string home; + std::string command; + std::string result("Failed : "); + std::string alias = _batch_descr.alias.in(); + std::string username = _batch_descr.username.in(); + std::string protocol = _batch_descr.protocol.in(); + std::string batch_type = _batch_descr.batch.in(); + + // Basic test + if (batch_type == "slurm") + { + INFOS("test_jobsubmit_simple not yet implemented for slurm... return OK"); + result = "OK"; + return result; + } + if (batch_type != "pbs") + { + result += "Batch type unknown ! : " + batch_type; + return result; + } + + // PBS test + std::string _test_file_simple = _test_filename + "_simple"; + std::ofstream file; + file.open(_test_file_simple.c_str(), std::ofstream::out); + file << "#!/bin/bash\n" + << "#PBS -l nodes=1\n" + << "#PBS -l walltime=00:01:00\n" + << "#PBS -o " + _date + "_simple_output.log\n" + << "#PBS -e " + _date + "_simple_error.log\n" + << "echo Bonjour\n" + << "echo Error >&2\n"; + file.flush(); + file.close(); + + // Getting home directory + std::string rst = get_home(&home); + if(rst != "") { + result += rst; + return result; + } + + // Build command for copy + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + _test_file_simple + " " + + username + "@" + alias + ":" + home; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in copy job file to remote host ! 
status = "; + result += oss.str(); + return result; + } + + // Build command for submit job + std::string file_job_name = _test_filename + "_jobid"; + command = protocol + " " + username + "@" + alias + " qsub " + _base_filename + "_simple > " + file_job_name; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in sending qsub to remote host ! status = "; + result += oss.str(); + return result; + } + std::string jobid; + std::ifstream file_job(file_job_name.c_str()); + if (!file_job) + { + result += "Error in reading temporary file ! filename = " + file_job_name; + return result; + } + std::getline(file_job, jobid); + file_job.close(); + + // Wait the end of the job + command = protocol + " " + username + "@" + alias + " qstat -f " + jobid + " > " + file_job_name; + bool stop = false; + while (!stop) + { + status = system(command.c_str()); + if(status && status != 153 && status != 256*153) + { + std::ostringstream oss; + oss << status; + result += "Error in sending qstat to remote host ! status = "; + result += oss.str(); + return result; + } + + if(status == 153 || status == 256*153 ) + stop = true; + sleep(1); + } + + // Build command for getting results + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + + username + "@" + alias + ":" + home + "/" + _date + "_simple* /tmp"; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "error in getting file result of qsub simple to remote host ! status = "; + result += oss.str(); + return result; + } + + // Test results + std::string normal_input; + std::string file_normal_name = "/tmp/" + _date + "_simple_output.log"; + std::ifstream file_normal(file_normal_name.c_str()); + if (!file_normal) + { + result += "Error in reading temporary file ! 
filename = " + file_normal_name; + return result; + } + std::getline(file_normal, normal_input); + file_normal.close(); + if (normal_input != "Bonjour") + { + result += "error from simple ouput file ! waiting for Bonjour and get : " + normal_input; + return result; + } + std::string error_input; + std::string file_error_name = "/tmp/" + _date + "_simple_error.log"; + std::ifstream file_error(file_error_name.c_str()); + if (!file_error) + { + result += "Error in reading temporary file ! filename = " + file_error_name; + return result; + } + std::getline(file_error, error_input); + file_error.close(); + if (error_input != "Error") + { + result += "error from simple error file ! waiting for Error and get : " + error_input; + return result; + } + result = "OK"; + return result; +} + +std::string +BatchLight_BatchTest::test_jobsubmit_mpi() +{ + int status; + std::string home; + std::string command; + MpiImpl * mpiImpl; // NOTE(review): allocated with new below and never deleted on any return path of this function + std::string result("Failed : "); + std::string alias = _batch_descr.alias.in(); + std::string username = _batch_descr.username.in(); + std::string protocol = _batch_descr.protocol.in(); + std::string batch_type = _batch_descr.batch.in(); + std::string mpi_type = _batch_descr.mpiImpl.in(); + + // Basic test + if(mpi_type == "lam") + mpiImpl = new MpiImpl_LAM(); + else if(mpi_type == "mpich1") + mpiImpl = new MpiImpl_MPICH1(); + else if(mpi_type == "mpich2") + mpiImpl = new MpiImpl_MPICH2(); + else if(mpi_type == "openmpi") + mpiImpl = new MpiImpl_OPENMPI(); + else + { + result += "Error MPI impl not supported : " + mpi_type; + return result; + } + + // SLURM not yet implemented: the MPI test is skipped and reported as OK + if (batch_type == "slurm") + { + INFOS("test_jobsubmit_mpi not yet implemented for slurm... 
return OK"); + result = "OK"; + return result; + } + + // MPI test + std::string _test_file_script = _test_filename + "_script"; + std::ofstream file_script; + file_script.open(_test_file_script.c_str(), std::ofstream::out); + file_script << "#!/bin/bash\n" + << "echo HELLO MPI\n"; + file_script.flush(); + file_script.close(); + chmod(_test_file_script.c_str(), 0x1ED); + + std::string _test_file_mpi = _test_filename + "_mpi"; + std::ofstream file_mpi; + file_mpi.open(_test_file_mpi.c_str(), std::ofstream::out); + file_mpi << "#!/bin/bash\n" + << "#PBS -l nodes=1\n" + << "#PBS -l walltime=00:01:00\n" + << "#PBS -o "<< _date << "_mpi_output.log\n" + << "#PBS -e " << _date << "_mpi_error.log\n" + << mpiImpl->boot("${PBS_NODEFILE}", 1) + << mpiImpl->run("${PBS_NODEFILE}", 1, _base_filename + "_script") + << mpiImpl->halt(); + file_mpi.flush(); + file_mpi.close(); + + // Getting home directory + std::string rst = get_home(&home); + if(rst != "") { + result += rst; + return result; + } + + // Build command for copy + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + _test_file_script + " " + + username + "@" + alias + ":" + home; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in copy job file to remote host ! status = "; + result += oss.str(); + return result; + } + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + _test_file_mpi + " " + + username + "@" + alias + ":" + home; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in copy job file to remote host ! 
status = "; + result += oss.str(); + return result; + } + + // Build command for submit job + std::string file_job_name = _test_filename + "_jobid"; + command = protocol + " " + username + "@" + alias + " qsub " + _base_filename + "_mpi > " + file_job_name; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in sending qsub to remote host ! status = "; + result += oss.str(); + return result; + } + std::string jobid; + std::ifstream file_job(file_job_name.c_str()); + if (!file_job) + { + result += "Error in reading temporary file ! filename = " + file_job_name; + return result; + } + std::getline(file_job, jobid); + file_job.close(); + + // Wait the end of the job + command = protocol + " " + username + "@" + alias + " qstat -f " + jobid + " > " + file_job_name; + bool stop = false; + while (!stop) + { + status = system(command.c_str()); + if(status && status != 153 && status != 256*153) + { + std::ostringstream oss; + oss << status; + result += "Error in sending qstat to remote host ! status = "; + result += oss.str(); + return result; + } + + if(status == 153 || status == 256*153 ) + stop = true; + sleep(1); + } + + // Build command for getting results + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + + username + "@" + alias + ":" + home + "/" + _date + "_mpi* /tmp"; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "error in getting file result of qsub mpi from remote host ! status = "; + result += oss.str(); + return result; + } + + // Test results + std::string normal_input; + std::string file_normal_name = "/tmp/" + _date + "_mpi_output.log"; + std::ifstream file_normal(file_normal_name.c_str()); + if (!file_normal) + { + result += "Error in reading temporary file ! 
filename = " + file_normal_name; + return result; + } + bool test_ok = false; + while (std::getline(file_normal, normal_input)) + { + if (normal_input == "HELLO MPI") + test_ok = true; + } + file_normal.close(); + if (!test_ok) + { + result += "error from mpi ouput file ! waiting for HELLO MPI please watch /tmp/" + _date + "_mpi_output.log file"; + return result; + } + result = "OK"; + return result; +} + +std::string +BatchLight_BatchTest::test_appli() +{ + int status; + std::string home; + std::string command; + std::string result("Failed : "); + std::string alias = _batch_descr.alias.in(); + std::string username = _batch_descr.username.in(); + std::string protocol = _batch_descr.protocol.in(); + std::string applipath = _batch_descr.applipath.in(); + + // Getting home directory + std::string rst = get_home(&home); + if(rst != "") { + result += rst; + return result; + } + + std::string _test_file_appli = _test_filename + "_appli_test"; + std::ofstream file_appli; + file_appli.open(_test_file_appli.c_str(), std::ofstream::out); + file_appli << "#!/bin/bash\n" + << "if [ -f " << applipath << "/runAppli ]\n" + << "then\n" + << " echo OK\n" + << "else\n" + << " echo NOK\n" + << "fi\n"; + file_appli.flush(); + file_appli.close(); + + // Build command for copy + command = "scp"; + if(protocol == "rsh") + command = "rcp"; + command += " " + _test_file_appli + " " + + username + "@" + alias + ":" + home; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in copy appli test file to remote host ! status = "; + result += oss.str(); + return result; + } + + // Launch test + command = protocol + " " + username + "@" + alias + + " sh " + home + "/" + _base_filename + "_appli_test > " + + _test_filename + "_appli_test_result"; + + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in launching appli test on remote host ! 
status = "; + result += oss.str(); + return result; + } + + // Read test result + std::string rst_appli; + std::string file_appli_result_name = _test_filename + "_appli_test_result"; + std::ifstream file_appli_result(file_appli_result_name.c_str()); + if (!file_appli_result) + { + result += "Error in reading temporary file ! filename = " + file_appli_result_name; + return result; + } + std::getline(file_appli_result, rst_appli); + file_appli_result.close(); + + if (rst_appli != "OK") + { + result += "Error checking application on remote host ! result = " + rst_appli; // report the remote script output; rst (get_home status) is always empty here + return result; + } + + result = "OK"; + return result; +} + +// Useful methods +std::string +BatchLight_BatchTest::get_home(std::string * home) +{ + int status; + std::string result = ""; + std::string command; + std::string alias = _batch_descr.alias.in(); + std::string username = _batch_descr.username.in(); + std::string protocol = _batch_descr.protocol.in(); + std::string file_home_name = _test_filename + "_home"; + + command = protocol + " " + username + "@" + alias + " 'echo $HOME' > " + file_home_name; + status = system(command.c_str()); + if(status) { + std::ostringstream oss; + oss << status; + result += "Error in getting home directory ! status = "; + result += oss.str(); + return result; + } + + std::ifstream file_home(file_home_name.c_str()); + if (!file_home) + { + result += "Error in reading temporary file ! 
filename = " + file_home_name; + return result; + } + std::getline(file_home, *home); + file_home.close(); + return result; +} diff --git a/src/Launcher/BatchLight_BatchTest.hxx b/src/Launcher/BatchLight_BatchTest.hxx new file mode 100644 index 000000000..0224965c7 --- /dev/null +++ b/src/Launcher/BatchLight_BatchTest.hxx @@ -0,0 +1,60 @@ +// Copyright (C) 2008 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, +// CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License. +// +// This library is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com +// + +#ifndef __BATCHLIGHT_BATCHTEST_HXX__ +#define __BATCHLIGHT_BATCHTEST_HXX__ + +#include +#include +#include + +#include +#include "utilities.h" +#include CORBA_CLIENT_HEADER(SALOME_ContainerManager) + +#include "Batch_Date.hxx" +#include "MpiImpl.hxx" + +class BatchLight_BatchTest +{ + public: + BatchLight_BatchTest(const Engines::MachineParameters& batch_descr); + virtual ~BatchLight_BatchTest(); + + bool test(); + + std::string test_connection(); + std::string test_filecopy(); + std::string test_getresult(); + std::string test_jobsubmit_simple(); + std::string test_jobsubmit_mpi(); + std::string test_appli(); + + protected: + std::string get_home(std::string * home); + + private: + Engines::MachineParameters _batch_descr; + 
std::string _test_filename; + std::string _base_filename; + std::string _date; +}; + +#endif diff --git a/src/Launcher/Makefile.am b/src/Launcher/Makefile.am index a78dc3edb..604a5c5df 100644 --- a/src/Launcher/Makefile.am +++ b/src/Launcher/Makefile.am @@ -36,12 +36,13 @@ include $(top_srcdir)/salome_adm/unix/make_common_starter.am # # header files salomeinclude_HEADERS = \ - BatchLight_BatchManager.hxx \ - BatchLight_BatchManager_PBS.hxx \ - BatchLight_BatchManager_SLURM.hxx \ - BatchLight_Job.hxx \ - MpiImpl.hxx \ - SALOME_Launcher.hxx + BatchLight_BatchManager.hxx \ + BatchLight_BatchManager_PBS.hxx \ + BatchLight_BatchManager_SLURM.hxx \ + BatchLight_Job.hxx \ + MpiImpl.hxx \ + BatchLight_BatchTest.hxx \ + SALOME_Launcher.hxx # Scripts to be installed dist_salomescript_DATA = @@ -97,12 +98,13 @@ COMMON_LIBS =\ # lib_LTLIBRARIES = libSalomeLauncher.la libSalomeLauncher_la_SOURCES=\ - SALOME_Launcher.cxx \ - BatchLight_BatchManager.cxx \ - BatchLight_BatchManager_SLURM.cxx \ - BatchLight_BatchManager_PBS.cxx \ - BatchLight_Job.cxx \ - MpiImpl.cxx + SALOME_Launcher.cxx \ + BatchLight_BatchManager.cxx \ + BatchLight_BatchManager_SLURM.cxx \ + BatchLight_BatchManager_PBS.cxx \ + BatchLight_Job.cxx \ + MpiImpl.cxx \ + BatchLight_BatchTest.cxx libSalomeLauncher_la_CPPFLAGS =\ $(COMMON_CPPFLAGS) diff --git a/src/Launcher/SALOME_Launcher.cxx b/src/Launcher/SALOME_Launcher.cxx index a507c010d..c0ac5f2ac 100644 --- a/src/Launcher/SALOME_Launcher.cxx +++ b/src/Launcher/SALOME_Launcher.cxx @@ -140,8 +140,13 @@ CORBA::Long SALOME_Launcher::submitSalomeJob( const char * fileToExecute , std::map < string, BatchLight::BatchManager * >::const_iterator it = _batchmap.find(clustername); if(it == _batchmap.end()) { - _batchmap[clustername] = FactoryBatchManager(p); - // TODO: Add a test for the cluster ! 
+ BatchLight_BatchTest t(*p); + if (t.test()) + _batchmap[clustername] = FactoryBatchManager(p); + else + { + throw SALOME_Exception("Test of the batch machine failed - see messages in the SALOME_Launcher log"); + } } // create and submit job on cluster diff --git a/src/Launcher/SALOME_Launcher.hxx b/src/Launcher/SALOME_Launcher.hxx index 4bc5d65ce..25e65e25a 100644 --- a/src/Launcher/SALOME_Launcher.hxx +++ b/src/Launcher/SALOME_Launcher.hxx @@ -24,6 +24,7 @@ #include CORBA_CLIENT_HEADER(SALOME_ContainerManager) #include "SALOME_ContainerManager.hxx" #include "BatchLight_BatchManager.hxx" +#include "BatchLight_BatchTest.hxx" #include -- 2.39.2