1 // Copyright (C) 2011-2012 EDF R&D
3 // This library is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU Lesser General Public
5 // License as published by the Free Software Foundation; either
6 // version 2.1 of the License.
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Lesser General Public License for more details.
13 // You should have received a copy of the GNU Lesser General Public
14 // License along with this library; if not, write to the Free Software
15 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 // See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com
19 // Authors : Guillaume Boulant (EDF) - 01/03/2011
28 #include "MeshJobManager_i.hxx"
30 #include <SALOMEconfig.h>
31 #include CORBA_SERVER_HEADER(SALOME_Exception)
34 #include "Basics_Utils.hxx" // For standard logging
36 #include "SALOME_KernelServices.hxx" // For CORBA logging
42 // ====================================================================
43 // General purpose helper functions (to put elsewhere at least)
44 // ====================================================================
48 * This function must be used to associate a datetime tag to a job
// Builds an integer tag from the time value held in `tv`; initialize() uses
// the result as the datetime tag of a job on the non-Windows path.
// NOTE(review): the statements that declare and fill `tv`, and the return
// statement, are not visible in this listing -- confirm against the full file.
52 static long timetag() {
// Seconds are scaled by 1000000 and added to the tv_usec field, so the tag is
// presumably a count of microseconds (assumes `tv` is a struct timeval -- TODO confirm).
// NOTE(review): the sum is stored in a `long`; where `long` is 32 bits wide the
// value cannot be represented exactly -- verify this is acceptable for a tag.
55 long tag = tv.tv_usec + tv.tv_sec*1000000;
61 * This function returns true if the string text starts with the string
// Tells whether `text` begins with `token`.
//
// \param text   the string to inspect
// \param token  the expected prefix
// \return true when the first token.length() characters of text equal token
//         (an empty token is a prefix of every text), false otherwise
static bool myStartsWith(const std::string& text,const std::string& token){
  // A text shorter than the token can never start with it.
  if(text.length() < token.length())
    return false;
  return (text.compare(0, token.length(), token) == 0);
}
71 * This function returns true if the file exists on the local file
// Tells whether `filename` can be opened for reading on the local file
// system.
//
// \param filename  path of the file to probe; a null pointer is reported as
//                  "does not exist" instead of being handed to the stream
// \return true when the stream opened on the file is in a good state
static bool fexists(const char *filename)
{
  if (!filename) {
    return false;
  }
  std::ifstream ifile(filename);
  // good() already implies that the stream is not in a failed state.
  return ifile.good();
}
86 // ====================================================================
87 // Constructor/Destructor
88 // ====================================================================
// Constructor of the MeshJobManager servant. All arguments are forwarded
// unchanged to the Engines_Component_i base class.
// The constructor then activates the object in the POA and looks up the two
// SALOME services used by this class (launcher and resources manager); it
// throws the exception built by KERNEL::createSalomeException when either of
// them is nil.
// NOTE(review): this listing does not show the lines between the statements
// below (braces, and possibly the initialisation of _thisObj) -- confirm
// against the full file.
90 MeshJobManager_i::MeshJobManager_i(CORBA::ORB_ptr orb,
91 PortableServer::POA_ptr poa,
92 PortableServer::ObjectId * contId,
93 const char *instanceName,
94 const char *interfaceName)
95 : Engines_Component_i(orb, poa, contId, instanceName, interfaceName)
97 LOG("Activating MESHJOB::MeshJobManager object");
// Register the servant with the POA and keep the returned object id in _id.
99 _id = _poa->activate_object(_thisObj);
// The launcher is the service used later to create, start and query jobs.
101 _salomeLauncher = KERNEL::getSalomeLauncher();
102 if(CORBA::is_nil(_salomeLauncher)){
103 LOG("The SALOME launcher can't be reached ==> STOP");
104 throw KERNEL::createSalomeException("SALOME launcher can't be reached");
// The resources manager is the service queried for resource definitions.
107 _resourcesManager = KERNEL::getResourcesManager();
108 if(CORBA::is_nil(_resourcesManager)){
109 LOG("The SALOME resource manager can't be reached ==> STOP");
110 throw KERNEL::createSalomeException("The SALOME resource manager can't be reached");
// Start with an empty error message (returned by getLastErrorMessage()).
113 _lastErrorMessage = "";
// Destructor: only traces its own invocation.
// NOTE(review): the MeshJobPaths objects stored in _jobPathsMap by
// initialize() are not released here -- confirm whether that is intended.
116 MeshJobManager_i::~MeshJobManager_i() {
117 LOG("MeshJobManager_i::~MeshJobManager_i()");
121 // ====================================================================
122 // Helper functions to deal with the local and remote file systems
123 // ====================================================================
125 #include <fstream> // to get the file streams
127 #include <stdlib.h> // to get _splitpath
128 #include <direct.h> // to get _mkdir
130 #include <unistd.h> // to get basename
131 #include <sys/stat.h> // to get mkdir
132 #include <sys/types.h> // to get mkdir options
135 #include <stdlib.h> // to get system and getenv
// Names of the files exchanged with the padder executable, and the separator
// used between the fields of a line of the data file.
static std::string OUTPUTFILE("output.med");
static std::string DATAFILE("data.txt");
static std::string SCRIPTFILE("padder.sh");
static std::string SEPARATOR(" ");

// Name of the current user, used to make the working directories per-user.
// getenv() returns a null pointer when USER is not defined, and building a
// std::string from a null pointer is undefined behaviour, so a placeholder
// name is used in that case.
static std::string USER(getenv("USER") ? getenv("USER") : "unknown");
// Working directories: all of them live under /tmp (clean() relies on that
// prefix before deleting anything).
static std::string LOCAL_INPUTDIR("/tmp/spadder.local.inputdir."+USER);
static std::string LOCAL_RESULTDIR("/tmp/spadder.local.resultdir."+USER);
static std::string REMOTE_WORKDIR("/tmp/spadder.remote.workdir."+USER);
148 * This function creates the padder text input file containing the
149 * input data (list of filenames and groupnames) and returns the path
150 * of the created file. This function is the one that knows the format
151 * of the padder input file. If the input file format changes, then
152 * this function (and only this one) should be updated. The file
153 * format is the following ([] means that the variable is optional):
155 * [<concreteMeshFile> <concreteGroupName>]
156 * nbSteelBarMeshes <N>
157 * <steelBarMeshFile_1> <steelBarGroupName_1>
158 * <steelBarMeshFile_2> <steelBarGroupName_2>
160 * <steelBarMeshFile_N> <steelBarGroupName_N>
// Writes the padder input data file LOCAL_INPUTDIR/DATAFILE and returns its
// path.
//   listConcreteMesh : only the first element (if any) is written
//   listSteelBarMesh : every element is written, after a line giving their number
// The returned pointer refers to a function-static string, so it stays valid
// after the call.
// NOTE(review): the preprocessor lines that select between the Windows and
// POSIX statements, the declaration of `line` and the closing braces are not
// visible in this listing -- confirm against the full file.
163 const char * MeshJobManager_i::_writeDataFile(std::vector<MESHJOB::MeshJobParameter> listConcreteMesh,
164 std::vector<MESHJOB::MeshJobParameter> listSteelBarMesh) {
// Create the local input directory; the return value of the call is not checked.
// (_mkdir and mkdir are presumably the Windows and POSIX alternatives.)
166 _mkdir(LOCAL_INPUTDIR.c_str());
168 mkdir(LOCAL_INPUTDIR.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
171 // Make it static so that it's allocated once (constant name)
172 static std::string * dataFilename = new std::string(LOCAL_INPUTDIR+"/"+DATAFILE);
// The stream is used below without testing that the file could be opened.
173 std::ofstream dataFile(dataFilename->c_str());
175 // Note that we use here the basename of the files because the files
176 // are supposed to be copied in the REMOTE_WORKDIR for execution.
179 // We first specify the concrete mesh data (filename and groupname)
180 if ( listConcreteMesh.size() > 0 ) {
// Two ways of extracting the file name without its directory part
// (_splitpath and basename are presumably the Windows and POSIX alternatives).
182 char fname[ _MAX_FNAME ];
183 _splitpath( listConcreteMesh[0].file_name, NULL, NULL, fname, NULL );
184 char* bname = &fname[0];
186 char* bname = basename(listConcreteMesh[0].file_name);
// One line: "<basename> <group name>".
188 line = std::string(bname) + " " + std::string(listConcreteMesh[0].group_name);
189 dataFile << line.c_str() << std::endl;
191 // Then, we can specify the steelbar mesh data, starting by the
193 int nbSteelBarMeshes=listSteelBarMesh.size();
// One line: "nbSteelBarMeshes<SEPARATOR><N>".
194 line = std::string("nbSteelBarMeshes") + SEPARATOR + ToString(nbSteelBarMeshes);
195 dataFile << line.c_str() << std::endl;
196 for (int i=0; i<nbSteelBarMeshes; i++) {
// Same basename extraction as for the concrete mesh, once per steelbar mesh.
198 char fname[ _MAX_FNAME ];
199 _splitpath( listSteelBarMesh[i].file_name, NULL, NULL, fname, NULL );
200 char* bname = &fname[0];
202 char* bname = basename(listSteelBarMesh[i].file_name);
204 line = std::string(bname) + " " + std::string(listSteelBarMesh[i].group_name);
205 dataFile << line.c_str() << std::endl;
208 // Finally, we conclude with the name of the output file
// NOTE(review): the statement that stores the output file name in `line`
// is not visible in this listing -- confirm.
210 dataFile << line.c_str() << std::endl;
212 return dataFilename->c_str();
216 * This function creates a shell script that runs padder with the
217 * specified data file, and returns the path of the created script
218 * file. The config id is used to retrieve the path to the binary file
219 * and other required files.
// Writes the shell script LOCAL_INPUTDIR/SCRIPTFILE that runs padder on the
// given data file, and returns the path of the script.
//   dataFileName : path of the data file; only its basename is written in the script
//   configId     : key used to read binpath and envpath from _configMap
// The returned pointer refers to a function-static string, so it stays valid
// after the call.
// NOTE(review): the preprocessor lines that select between the Windows and
// POSIX statements and the closing braces are not visible in this listing --
// confirm against the full file.
221 const char* MeshJobManager_i::_writeScriptFile(const char * dataFileName, const char * configId) {
// Create the local input directory; the return value of the call is not checked.
// (_mkdir and mkdir are presumably the Windows and POSIX alternatives.)
223 _mkdir(LOCAL_INPUTDIR.c_str());
225 mkdir(LOCAL_INPUTDIR.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
228 // Make it static so that it's allocated once (constant name)
229 static std::string * scriptFilename = new std::string(LOCAL_INPUTDIR+"/"+SCRIPTFILE);
// NOTE(review): configId is looked up with operator[] and no check is made
// that it was registered through configure() -- verify against the type of
// _configMap and the callers.
231 char * binpath = _configMap[configId].binpath;
232 char * envpath = _configMap[configId].envpath;
// Two ways of extracting the file name without its directory part
// (_splitpath and basename are presumably the Windows and POSIX alternatives).
235 char fname[ _MAX_FNAME ];
236 _splitpath( dataFileName, NULL, NULL, fname, NULL );
237 const char* bname = &fname[0];
239 const char* bname = basename(dataFileName);
// The script sources the environment file, then runs the binary on the data
// file located in the directory of the script itself ($here).
243 std::ofstream script(scriptFilename->c_str());
244 script << "#!/bin/sh" << std::endl;
245 script << "here=$(dirname $0)" << std::endl;
246 script << ". " << envpath << std::endl;
247 script << binpath << " $here/" << bname << std::endl;
248 // Note that we use the basename of the datafile because all data
249 // files are supposed to have been copied in the REMOTE_WORKDIR.
251 return scriptFilename->c_str();
255 // ====================================================================
256 // Functions to initialize and supervise the mesh computation job
257 // ====================================================================
// Registers (or replaces) the configuration stored under configId.
//   configId        : key of the configuration in _configMap
//   configParameter : values copied into _configMap (binpath and envpath are traced)
// NOTE(review): the return statement is not visible in this listing -- confirm
// the returned value against the full file.
259 bool MeshJobManager_i::configure(const char *configId,
260 const MESHJOB::ConfigParameter & configParameter)
262 beginService("MeshJobManager_i::configure");
// An existing entry with the same key is overwritten by the assignment.
264 _configMap[configId] = configParameter;
266 LOG("Adding configuration for " << configId);
267 LOG("- binpath = " << _configMap[configId].binpath);
268 LOG("- envpath = " << _configMap[configId].envpath);
270 endService("MeshJobManager_i::configure");
// Job identifier returned by initialize() when the job could not be created.
274 long MeshJobManager_i::JOBID_UNDEFINED = -1;
276 /*! Initialize a smesh computation job and return the job identifier */
277 CORBA::Long MeshJobManager_i::initialize(const MESHJOB::MeshJobParameterList & meshJobParameterList,
278 const char * configId)
280 beginService("MeshJobManager_i::initialize");
282 // We first analyse the CORBA sequence to store data in C++ vectors
284 std::vector<MESHJOB::MeshJobParameter> listConcreteMesh;
285 std::vector<MESHJOB::MeshJobParameter> listSteelBarMesh;
286 for(CORBA::ULong i=0; i<meshJobParameterList.length(); i++) {
287 MESHJOB::MeshJobParameter currentMesh = meshJobParameterList[i];
288 switch ( currentMesh.file_type ) {
289 case MESHJOB::MED_CONCRETE:
290 listConcreteMesh.push_back(currentMesh);
292 case MESHJOB::MED_STEELBAR:
293 listSteelBarMesh.push_back(currentMesh);
297 std::string("The type of the file ")+
298 std::string(currentMesh.file_name)+
299 std::string(" is not recognized");
300 LOG(_lastErrorMessage);
301 return JOBID_UNDEFINED;
305 // It is not possible to specify more than one concrete
306 // file. Converselly, it is possible to specify no concrete file.
307 if ( listConcreteMesh.size() > 1 ) {
308 // Not consistent with the specification
309 _lastErrorMessage = std::string("You specify more than one concrete mesh (not authorized)");
310 LOG(_lastErrorMessage);
311 return JOBID_UNDEFINED;
314 LOG("Nb. concrete mesh = " << listConcreteMesh.size());
315 LOG("Nb. steelbar mesh = " << listSteelBarMesh.size());
317 // We initiate here a datetime to tag the files and folder
318 // associated to this job.
320 DWORD jobDatetimeTag = timeGetTime();
322 long jobDatetimeTag = timetag();
324 // And a MESHJOB::MeshJobPaths structure to hold the directories
325 // where to find data
326 MESHJOB::MeshJobPaths * jobPaths = new MESHJOB::MeshJobPaths();
327 jobPaths->local_inputdir = LOCAL_INPUTDIR.c_str();
328 jobPaths->local_resultdir = (LOCAL_RESULTDIR + "." + ToString(jobDatetimeTag)).c_str();
329 jobPaths->remote_workdir = (REMOTE_WORKDIR + "." + ToString(jobDatetimeTag)).c_str();
332 // Then, we have to create the padder input data file. This input
333 // data is a text file containing the list of file names and group
336 const char * dataFilename = this->_writeDataFile(listConcreteMesh, listSteelBarMesh);
337 LOG("dataFilename = " << dataFilename);
338 const char * scriptFilename = this->_writeScriptFile(dataFilename, configId);
339 LOG("scriptFilename = " << scriptFilename);
342 // Then, the following instructions consists in preparing the job
343 // parameters to request the SALOME launcher for creating a new
346 Engines::JobParameters_var jobParameters = new Engines::JobParameters;
347 jobParameters->job_type = CORBA::string_dup("command");
348 // CAUTION: the job_file must be a single filename specifying a
349 // self-consistent script to be executed without any argument on the
351 jobParameters->job_file = CORBA::string_dup(scriptFilename);
354 // Specification of the working spaces:
356 // - local_directory: can be used to specify where to find the input
357 // files on the local resource. It's optionnal if you specify the
358 // absolute path name of input files.
360 // - result_directory: must be used to specify where to download the
361 // output files on the local resources
363 // - work_directory: must be used to specify the remote directory
364 // where to put all the stuff to run the job. Note that the job
365 // will be executed from within this directory, i.e. a change
366 // directory toward this working directory is done by the batch
367 // system before running the specified job script.
369 jobParameters->local_directory = CORBA::string_dup("");
370 jobParameters->result_directory = CORBA::string_dup(jobPaths->local_resultdir);
371 jobParameters->work_directory = CORBA::string_dup(jobPaths->remote_workdir);
373 // We specify the input files that are required to execute the
374 // job_file. If basenames are specified, then the files are supposed
375 // to be located in local_directory.
376 int nbcmesh = listConcreteMesh.size();
377 int nbsmesh = listSteelBarMesh.size();
378 int nbFiles = nbsmesh+nbcmesh+1;
379 // The number of input file is:
380 // (nb. of steelbar meshfile)
381 // + (1 or 0 concrete meshfile)
382 // + (1 padder input file)
383 jobParameters->in_files.length(nbFiles);
384 for (int i=0; i<nbcmesh; i++) {
385 jobParameters->in_files[i] = CORBA::string_dup(listConcreteMesh[i].file_name);
387 for (int i=0; i<nbsmesh; i++) {
388 jobParameters->in_files[nbcmesh+i] = CORBA::string_dup(listSteelBarMesh[i].file_name);
390 jobParameters->in_files[nbcmesh+nbsmesh] = CORBA::string_dup(dataFilename);
391 // Note that all these input files will be copied in the
392 // REMOTE_WORKDIR on the remote host. At this step, they should
393 // all exist, so we can check their presence on the local
395 for (int i=0; i<nbFiles; i++) {
396 if ( fexists(jobParameters->in_files[i]) != true ) {
397 _lastErrorMessage = std::string("The input file ") + std::string(jobParameters->in_files[i]);
398 _lastErrorMessage+= std::string(" does not exists. Can't initialize the job");
399 LOG(_lastErrorMessage);
400 return JOBID_UNDEFINED;
404 // Then, we have to specify the existance of an output filename. The
405 // path is supposed to be a path on the remote resource, i.e. where
406 // the job is executed.
407 jobParameters->out_files.length(1);
408 std::string outputfile_name = std::string(jobPaths->remote_workdir)+"/"+OUTPUTFILE;
409 jobParameters->out_files[0] = CORBA::string_dup(outputfile_name.c_str());
411 // CAUTION: the maximum duration has to be set with a format like "hh:mm"
412 jobParameters->maximum_duration = CORBA::string_dup("01:00");
413 jobParameters->queue = CORBA::string_dup("");
415 // Setting resource and additionnal properties (if needed)
416 // The resource parameters can be initiated from scratch, for
417 // example by specifying the values in hard coding:
419 //jobParameters->resource_required.name = CORBA::string_dup("localhost");
420 //jobParameters->resource_required.hostname = CORBA::string_dup("localhost");
421 //jobParameters->resource_required.mem_mb = 1024 * 10;
422 //jobParameters->resource_required.nb_proc = 1;
424 // But it's better to initiate these parameters from a resource
425 // definition known by the resource manager. This ensures that the
426 // resource will be available:
427 //const char * resourceName = "localhost";
428 //const char * resourceName = "boulant@claui2p1";
429 //const char * resourceName = "nepal@nepal";
430 const char * resourceName = _configMap[configId].resname;
432 Engines::ResourceDefinition * resourceDefinition;
434 resourceDefinition = _resourcesManager->GetResourceDefinition(resourceName);
436 catch (const CORBA::SystemException& ex) {
437 _lastErrorMessage = std::string("We can not access to the ressource ") + std::string(resourceName);
438 _lastErrorMessage+= std::string("(check the file CatalogResource.xml)");
439 LOG(_lastErrorMessage);
440 return JOBID_UNDEFINED;
442 // CAUTION: This resource should have been defined in the
443 // CatalogResource.xml associated to the SALOME application.
445 // Then, the values can be used to initiate the resource parameters
447 jobParameters->resource_required.name = CORBA::string_dup(resourceDefinition->name.in());
448 // CAUTION: the additionnal two following parameters MUST be
449 // specified explicitly, because they are not provided by the
450 // resource definition:
451 jobParameters->resource_required.mem_mb = resourceDefinition->mem_mb;
452 jobParameters->resource_required.nb_proc = resourceDefinition->nb_proc_per_node;
453 // CAUTION: the parameter mem_mb specifies the maximum memory value
454 // that could be allocated for executing the job. This takes into
455 // account not only the data that could be loaded by the batch
456 // process but also the linked dynamic library.
458 // A possible problem, for exemple in the case where you use the ssh
459 // emulation of a batch system, is to get an error message as below
460 // when libBatch try to run the ssh command:
462 // ## /usr/bin/ssh: error while loading shared libraries: libcrypto.so.0.9.8: failed
463 // ## to map segment from shared object: Cannot allocate memory
465 // In this exemple, the mem_mb was set to 1MB, value that is not
466 // sufficient to load the dynamic libraries linked to the ssh
467 // executable (libcrypto.so in the error message).
469 // So, even in the case of a simple test shell script, you should
470 // set this value at least to a standard threshold as 500MB
471 int jobId = JOBID_UNDEFINED;
473 jobId = _salomeLauncher->createJob(jobParameters);
474 // We register the datetime tag of this job
475 _jobDateTimeMap[jobId]=jobDatetimeTag;
476 _jobPathsMap[jobId] = jobPaths;
478 catch (const SALOME::SALOME_Exception & ex) {
479 LOG("SALOME Exception at initialization step !" <<ex.details.text.in());
480 _lastErrorMessage = ex.details.text.in();
481 return JOBID_UNDEFINED;
483 catch (const CORBA::SystemException& ex) {
484 LOG("Receive SALOME System Exception: "<<ex);
485 LOG("Check SALOME servers...");
486 _lastErrorMessage = "Check the SALOME servers (or try to restart SALOME)";
487 return JOBID_UNDEFINED;
490 endService("MeshJobManager_i::initialize");
494 /*! Submit the job execution and return true if submission is OK */
495 bool MeshJobManager_i::start(CORBA::Long jobId) {
496 beginService("MeshJobManager_i::start");
499 _salomeLauncher->launchJob(jobId);
501 catch (const SALOME::SALOME_Exception & ex) {
502 LOG("SALOME Exception in launchjob !" <<ex.details.text.in());
503 _lastErrorMessage = ex.details.text.in();
506 catch (const CORBA::SystemException& ex) {
507 LOG("Receive SALOME System Exception: "<<ex);
508 LOG("Check SALOME servers...");
509 _lastErrorMessage = "Check the SALOME servers (or try to restart SALOME)";
513 endService("MeshJobManager_i::initialize");
517 /*! Request the launch manager for the state of the specified job */
518 char* MeshJobManager_i::getState(CORBA::Long jobId) {
519 beginService("MeshJobManager_i::getState");
524 state = _salomeLauncher->getJobState(jobId);
526 catch (const SALOME::SALOME_Exception & ex)
528 LOG("SALOME Exception in getJobState !");
529 _lastErrorMessage = ex.details.text.in();
530 state = ex.details.text;
532 catch (const CORBA::SystemException& ex)
534 LOG("Receive SALOME System Exception: " << ex);
535 state="SALOME System Exception - see logs";
537 LOG("jobId="<<ToString(jobId)<<" state="<<state);
538 endService("MeshJobManager_i::getState");
539 return CORBA::string_dup(state.c_str());
// Looks up the working paths registered for jobId in _jobPathsMap.
// Returns NULL (after logging) when the stored pointer is NULL.
// NOTE(review): the return statement of the normal path is not visible in
// this listing -- presumably the pointer read from the map is returned as is,
// i.e. without a copy; confirm who owns it.
542 MESHJOB::MeshJobPaths * MeshJobManager_i::getPaths(CORBA::Long jobId) {
// NOTE(review): if _jobPathsMap is a std::map, operator[] inserts a NULL
// entry for an unknown jobId -- confirm the container type.
544 MESHJOB::MeshJobPaths * jobPaths = _jobPathsMap[jobId];
545 if ( jobPaths == NULL ) {
546 LOG("You request the working paths for an undefined job (jobId="<<ToString(jobId)<<")");
547 return NULL; // Maybe raise an exception?
553 MESHJOB::MeshJobResults * MeshJobManager_i::finalize(CORBA::Long jobId) {
554 beginService("MeshJobManager_i::getResults");
555 MESHJOB::MeshJobResults * result = new MESHJOB::MeshJobResults();
557 MESHJOB::MeshJobPaths * jobPaths = this->getPaths(jobId);
558 std::string local_resultdir(jobPaths->local_resultdir);
559 result->results_dirname = local_resultdir.c_str();
562 _salomeLauncher->getJobResults(jobId, local_resultdir.c_str());
564 // __BUG__: to prevent from a bug of the MED driver (SALOME
565 // 5.1.5), we change the basename of the output file to force the
566 // complete reloading of data by the med driver.
567 long jobDatetimeTag = _jobDateTimeMap[jobId];
568 std::string outputFileName = "output"+ToString(jobDatetimeTag)+".med";
569 rename((local_resultdir+"/"+OUTPUTFILE).c_str(), (local_resultdir+"/"+outputFileName).c_str());
571 result->outputmesh_filename = outputFileName.c_str();
572 result->status = "OK";
574 catch (const SALOME::SALOME_Exception & ex)
576 LOG("SALOME Exception in getResults !");
577 result->status = "SALOME Exception in getResults !";
578 _lastErrorMessage = ex.details.text.in();
580 catch (const CORBA::SystemException& ex)
582 LOG("Receive CORBA System Exception: " << ex);
583 result->status = "Receive CORBA System Exception: see log";
585 endService("MeshJobManager_i::getResults");
590 /*! Clean all data associated to this job and remove the job from the launch manager */
591 bool MeshJobManager_i::clean(CORBA::Long jobId) {
592 beginService("MeshJobManager_i::clean");
594 // __GBO__ WORK IN PROGRESS: we just clean the temporary local
595 // directories. The remote working directories are tag with the
596 // execution datetime and the we prevent the task from conflict
597 // with files of another task.
598 MESHJOB::MeshJobPaths * jobPaths = this->getPaths(jobId);
599 if ( jobPaths == NULL ) return false;
602 // For safety reason (and prevent from bug that could erase the
603 // filesystem), we cancel the operation in the case where the
604 // directories to delete are not in the /tmp folder.
605 std::string shell_command("rm -rf ");
606 std::string inputdir(jobPaths->local_inputdir);
607 std::string resultdir(jobPaths->local_resultdir);
608 if ( !myStartsWith(inputdir,"/tmp/") ) {
609 LOG("WRN: The directory "<<inputdir<<" is not in /tmp. NO DELETE is done");
611 shell_command+=inputdir+" ";
613 if ( !myStartsWith(resultdir,"/tmp/")) {
614 LOG("WRN: The directory "<<resultdir<<" is not in /tmp. NO DELETE is done");
616 shell_command+=resultdir;
619 LOG("DBG: clean shell command = "<<shell_command);
621 bool cleanOk = false;
622 int error = system(shell_command.c_str());
623 if (error == 0) cleanOk = true;
625 endService("MeshJobManager_i::clean");
630 std::vector<std::string> * MeshJobManager_i::_getResourceNames() {
633 // These part is just to control the available resources
635 Engines::ResourceParameters params;
636 KERNEL::getLifeCycleCORBA()->preSet(params);
638 Engines::ResourceList * resourceList = _resourcesManager->GetFittingResources(params);
639 Engines::ResourceDefinition * resourceDefinition = NULL;
640 LOG("### resource list:");
641 std::vector<std::string>* resourceNames = new std::vector<std::string>();
643 for (int i = 0; i < resourceList->length(); i++) {
644 const char* aResourceName = (*resourceList)[i];
645 resourceNames->push_back(std::string(aResourceName));
646 LOG("resource["<<i<<"] = "<<aResourceName);
647 resourceDefinition = _resourcesManager->GetResourceDefinition(aResourceName);
648 LOG("protocol["<<i<<"] = "<<resourceDefinition->protocol);
652 // Note: a ResourceDefinition is used to create a batch configuration
653 // in the Launcher. This operation is done at Launcher startup from
654 // the configuration file CatalogResources.xml provided by the
655 // SALOME application.
656 // In the code instructions, you just have to choose a resource
657 // configuration by its name and then define the ResourceParameters
658 // that specify additionnal properties for a specific job submission
659 // (use the attribute resource_required of the JobParameters).
661 return resourceNames;
664 char* MeshJobManager_i::getLastErrorMessage() {
665 beginService("MeshJobManager_i::getState");
666 endService("MeshJobManager_i::getState");
667 return CORBA::string_dup(_lastErrorMessage.c_str());
671 // ==========================================================================
673 // ==========================================================================
// Factory function of the component: creates a MeshJobManager_i servant with
// the given arguments and returns the object id reported by its getId().
// NOTE(review): the lines surrounding this definition (linkage specification,
// braces) are not visible in this listing -- confirm against the full file.
677 PortableServer::ObjectId * MeshJobManagerEngine_factory( CORBA::ORB_ptr orb,
678 PortableServer::POA_ptr poa,
679 PortableServer::ObjectId * contId,
680 const char *instanceName,
681 const char *interfaceName)
683 LOG("PortableServer::ObjectId * MeshJobManagerEngine_factory()");
// The servant is allocated with new and not deleted here; its constructor
// activates it in the POA.
684 MeshJobManager_i * myEngine = new MeshJobManager_i(orb, poa, contId, instanceName, interfaceName);
685 return myEngine->getId() ;