From cb153cc399864bd62ef16978930baf48d063d8dc Mon Sep 17 00:00:00 2001 From: ribes Date: Thu, 26 Nov 2009 13:50:34 +0000 Subject: [PATCH] - Add nb_proc to MachineParameters - Adding $HOME to default work_directory - RemoveJob fixed --- idl/SALOME_ContainerManager.idl | 2 + src/Container/SALOME_ContainerManager.cxx | 1 + src/Launcher/Launcher.cxx | 5 +- src/Launcher/Launcher_Job.cxx | 22 +- src/Launcher/Launcher_Job.hxx | 1 + src/Launcher/SALOME_Launcher.cxx | 1 + src/LifeCycleCORBA/LifeCycleCORBA.py | 275 ------------------ src/LifeCycleCORBA/SALOME_LifeCycleCORBA.cxx | 2 + src/LifeCycleCORBA_SWIG/LifeCycleCORBA.py | 2 +- .../libSALOME_LifeCycleCORBA.i | 4 + src/ResourcesManager/ResourcesManager.cxx | 2 + src/ResourcesManager/ResourcesManager.hxx | 1 + .../SALOME_ResourcesCatalog_Parser.cxx | 14 + .../SALOME_ResourcesCatalog_Parser.hxx | 1 + .../SALOME_ResourcesManager.cxx | 1 + 15 files changed, 50 insertions(+), 284 deletions(-) delete mode 100644 src/LifeCycleCORBA/LifeCycleCORBA.py diff --git a/idl/SALOME_ContainerManager.idl b/idl/SALOME_ContainerManager.idl index ebd2c47f9..6108e22c2 100644 --- a/idl/SALOME_ContainerManager.idl +++ b/idl/SALOME_ContainerManager.idl @@ -53,6 +53,8 @@ struct MachineParameters MachineList computerList; //! required operating system string OS; + //! required number of proc + long nb_proc; //! required memory size long mem_mb; //! required frequency diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index 8afbc5f2f..6b56ef876 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -1364,6 +1364,7 @@ SALOME_ContainerManager::BuildCommandToLaunchParallelContainer(const std::string rtn->cpu_clock = params.cpu_clock; rtn->nb_proc_per_node = params.nb_proc_per_node; rtn->nb_node = params.nb_node; + rtn->nb_proc = params.nb_proc; rtn->isMPI = params.isMPI; // Step 1 : local or remote launch ? 
diff --git a/src/Launcher/Launcher.cxx b/src/Launcher/Launcher.cxx index c77c680b2..812cfedfc 100644 --- a/src/Launcher/Launcher.cxx +++ b/src/Launcher/Launcher.cxx @@ -290,7 +290,8 @@ Launcher_cpp::removeJob(int job_id) throw LauncherException("Cannot find the job, is it created ?"); } - _launcher_job_map.erase(it_job); // Erase call delete on it_job->second + delete it_job->second; + _launcher_job_map.erase(it_job); } //============================================================================= @@ -322,7 +323,7 @@ Launcher_cpp::createJobWithFile(const std::string xmlExecuteFile, machineParams p; p.hostname = clusterName; - p.nb_node = job_params.NbOfProcesses; + p.nb_proc = job_params.NbOfProcesses; new_job->setMachineRequiredParams(p); createJob(new_job); diff --git a/src/Launcher/Launcher_Job.cxx b/src/Launcher/Launcher_Job.cxx index ba645a694..da1f5c2aa 100644 --- a/src/Launcher/Launcher_Job.cxx +++ b/src/Launcher/Launcher_Job.cxx @@ -37,6 +37,7 @@ Launcher::Job::Job() _maximum_duration_in_second = -1; _machine_required_params.hostname = ""; _machine_required_params.OS = ""; + _machine_required_params.nb_proc = -1; _machine_required_params.nb_node = -1; _machine_required_params.nb_proc_per_node = -1; _machine_required_params.cpu_clock = -1; @@ -55,7 +56,16 @@ Launcher::Job::~Job() LAUNCHER_MESSAGE("Deleting job number: " << _number); #ifdef WITH_LIBBATCH if (_batch_job_id.getReference() != "undefined") - _batch_job_id.deleteJob(); + { + try + { + _batch_job_id.deleteJob(); + } + catch (const Batch::EmulationException &ex) + { + LAUNCHER_INFOS("WARNING: exception when deleting the job: " << ex.message); + } + } if (_batch_job) delete _batch_job; #endif @@ -303,10 +313,10 @@ Launcher::Job::checkMaximumDuration(const std::string & maximum_duration) void Launcher::Job::checkMachineRequiredParams(const machineParams & machine_required_params) { - // nb_node has be to > 0 - if (machine_required_params.nb_node <= 0) + // nb_proc has to be > 0 + if
(machine_required_params.nb_proc <= 0) { - std::string message("[Launcher::Job::checkMachineRequiredParams] node number is not >0 ! "); + std::string message("[Launcher::Job::checkMachineRequiredParams] proc number is not > 0 ! "); throw LauncherException(message); } } @@ -393,7 +403,7 @@ Launcher::Job::common_job_params() Batch::Parametre params; params[USER] = _machine_definition.UserName; - params[NBPROC] = _machine_required_params.nb_node; + params[NBPROC] = _machine_required_params.nb_proc; // Memory if (_machine_required_params.mem_mb > 0) @@ -416,7 +426,7 @@ Launcher::Job::common_job_params() } i++ ; } - _work_directory = std::string("Batch/"); + _work_directory = std::string("$HOME/Batch/"); _work_directory += thedate; } params[WORKDIR] = _work_directory; diff --git a/src/Launcher/Launcher_Job.hxx b/src/Launcher/Launcher_Job.hxx index ec2938313..12686a5be 100644 --- a/src/Launcher/Launcher_Job.hxx +++ b/src/Launcher/Launcher_Job.hxx @@ -39,6 +39,7 @@ #include #include #include +#include #endif namespace Launcher diff --git a/src/Launcher/SALOME_Launcher.cxx b/src/Launcher/SALOME_Launcher.cxx index 4459c76fa..52e19abb3 100644 --- a/src/Launcher/SALOME_Launcher.cxx +++ b/src/Launcher/SALOME_Launcher.cxx @@ -152,6 +152,7 @@ SALOME_Launcher::createJob(const Engines::JobParameters & job_parameters) machineParams p; p.hostname = job_parameters.resource_required.hostname; p.OS = job_parameters.resource_required.OS; + p.nb_proc = job_parameters.resource_required.nb_proc; p.nb_node = job_parameters.resource_required.nb_node; p.nb_proc_per_node = job_parameters.resource_required.nb_proc_per_node; p.cpu_clock = job_parameters.resource_required.cpu_clock; diff --git a/src/LifeCycleCORBA/LifeCycleCORBA.py b/src/LifeCycleCORBA/LifeCycleCORBA.py deleted file mode 100644 index 133ab610c..000000000 --- a/src/LifeCycleCORBA/LifeCycleCORBA.py +++ /dev/null @@ -1,275 +0,0 @@ -# -*- coding: iso-8859-1 -*- -# Copyright (C) 2007-2008 CEA/DEN, EDF R&D, OPEN CASCADE -# -# Copyright 
(C) 2003-2007 OPEN CASCADE, EADS/CCR, LIP6, CEA/DEN, -# CEDRAT, EDF R&D, LEG, PRINCIPIA R&D, BUREAU VERITAS -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# See http://www.salome-platform.org/ or email : webmaster.salome@opencascade.com -# -# SALOME LifeCycleC RBA : implementation of containers and engines life cycle both in Python and C++ -# File : LifeCycleCORBA.py -# Author : Paul RASCLE, EDF -# Module : SALOME -# $Header$ -# -import os -import sys -import time -import string -from omniORB import CORBA -import CosNaming -import Engines -reload(Engines) -import SALOME_ModuleCatalog - -from SALOME_utilities import * -from Utils_Identity import getShortHostName -import Utils_Identity -import Launchers - -class LifeCycleCORBA: - _orb = None - _rootcontext = None - _containerRootContext = None - _catalog = None - - #------------------------------------------------------------------------- - - def __init__(self, orb): - MESSAGE( "LifeCycleCORBA::__init__" ) - self._orb = orb - - obj = self._orb.resolve_initial_references("NameService") - self._rootContext = obj._narrow(CosNaming.NamingContext) - - if self._rootContext is None: - MESSAGE( "Name Service Reference is invalid" ) - - name = [CosNaming.NameComponent("Containers","dir")] - try: - self._containerRootContext = self._rootContext.bind_new_context(name) - - except 
CosNaming.NamingContext.AlreadyBound, ex: - MESSAGE( "/Containers.dir Context already exists" ) - obj = self._rootContext.resolve(name) - self._containerRootContext = obj._narrow(CosNaming.NamingContext) - if self._containerRootContext is None: - MESSAGE( "Containers.dir exists but it is not a NamingContext" ) - - name = [CosNaming.NameComponent("Kernel","dir"), - CosNaming.NameComponent("ModulCatalog","object")] - try: - obj = self._rootContext.resolve(name) - except CosNaming.NamingContext.NotFound, ex: - MESSAGE( "/Kernel.dir/ModulCatalog.object not found in Naming Service" ) - - self._catalog = obj._narrow(SALOME_ModuleCatalog.ModuleCatalog) - if self._catalog is None: - MESSAGE( "/Kernel.dir/ModulCatalog.object exists but is not a ModulCatalog" ) - - name = [CosNaming.NameComponent("ContainerManager","object")] - try: - obj = self._rootContext.resolve(name) - except CosNaming.NamingContext.NotFound, ex: - MESSAGE( "ContainerManager.object not found in Naming Service" ) - self._contManager = obj._narrow(Engines.ContainerManager) - if self._contManager is None: - MESSAGE( "ContainerManager.object exists but is not a ContainerManager") - - #------------------------------------------------------------------------- - - def ContainerName(self, containerName): - theComputer = "" - try: - theComputer , theContainer = containerName.split('/') - except: - theComputer = "" - theContainer = containerName - - if theComputer in ("","localhost") : - theComputer = getShortHostName() - - MESSAGE( theComputer + theContainer ) - return theComputer,theContainer - - #------------------------------------------------------------------------- - - def ComputerPath(self, ComputerName ): - try: - #path = self._catalog.GetPathPrefix( ComputerName ) - path = os.getenv("KERNEL_ROOT_DIR") + "/bin/salome/" - except SALOME_ModuleCatalog.NotFound, ex: - path = "" - return path - - #------------------------------------------------------------------------- - - def FindContainer(self, 
containerName): - theComputer,theContainer = self.ContainerName( containerName ) - name = [CosNaming.NameComponent(theComputer,"dir"), - CosNaming.NameComponent(theContainer,"object")] - obj = None - try: - obj = self._containerRootContext.resolve(name) - MESSAGE( containerName + ".object found in Naming Service" ) - - except CosNaming.NamingContext.NotFound, ex: - MESSAGE( containerName + ".object not found in Naming Service" ) - - if obj is None: - container = None - else: - container = obj._narrow(Engines.Container) - if container is None: - MESSAGE( containerName + ".object exists but is not a Container" ) - return container - - #------------------------------------------------------------------------- - - def FindComponent(self,containerName,componentName,listOfMachines): - if containerName!="": - machinesOK=[] - for i in range(len(listOfMachines)): - currentMachine=listOfMachines[i] - componentNameForNS= [CosNaming.NameComponent(currentMachine,"dir"), - CosNaming.NameComponent(containerName,"dir"), - CosNaming.NameComponent(componentName,"object")] - obj=None - try: - obj = self._containerRootContext.resolve(componentNameForNS) - except CosNaming.NamingContext.NotFound, ex: - MESSAGE( "component " + componentName + " not found on machine " + currentMachine + " , trying to load" ) - pass - if obj is not None: - machinesOK.append(currentMachine) - pass - pass - if len(machinesOK)!=0: - bestMachine=self._contManager.FindFirst(machinesOK) - componentNameForNS= [CosNaming.NameComponent(bestMachine,"dir"), - CosNaming.NameComponent(containerName,"dir"), - CosNaming.NameComponent(componentName,"object")] - obj=None - try: - obj = self._containerRootContext.resolve(componentNameForNS) - except: - pass - if obj is not None: - return obj._narrow(Engines.Component) - else: - MESSAGE( "Big problem !!!") - return None - else: - return None - else: - bestMachine=self._contManager.FindFirst(listOfMachines) - MESSAGE("Not implemented yet ...") - return None - pass - - 
#------------------------------------------------------------------------- - - def setLauncher(self,name): - """Change default launcher to the launcher identified by name - - See module Launchers.py - """ - Launchers.setLauncher(name) - - #------------------------------------------------------------------------- - - def StartContainer(self, theComputer , theContainer ): - """Start a container on theComputer machine with theContainer name - """ - # Get the Naming Service address - # - addr=self._orb.object_to_string(self._rootContext) - # - # If container name contains "Py" launch a Python Container - # - if theContainer.find('Py') == -1 : - CMD=['SALOME_Container',theContainer,'-ORBInitRef','NameService='+addr] - else: - CMD=['SALOME_ContainerPy.py',theContainer,'-ORBInitRef','NameService='+addr] - if theComputer in ("","localhost"): - theComputer=getShortHostName() - # - # Get the appropriate launcher and ask to launch - # - Launchers.getLauncher(theComputer).launch(theComputer,CMD) - # - # Wait until the container is registered in Naming Service - # - count =5 - aContainer=None - while aContainer is None and count > 0: - time.sleep(1) - count = count - 1 - MESSAGE( str(count) + ". 
Waiting for " + theComputer + "/" + theContainer ) - aContainer = self.FindContainer( theComputer + "/" + theContainer ) - return aContainer - - #------------------------------------------------------------------------- - - def FindOrStartContainer(self, theComputer , theContainer ): - """Find or Start a container on theComputer machine with theContainer name - """ - if theComputer in ("","localhost"): - theComputer=getShortHostName() - MESSAGE( "FindOrStartContainer: " + theComputer + theContainer ) - aContainer = self.FindContainer( theComputer + "/" + theContainer ) - if aContainer is None : - aContainer= self.StartContainer(theComputer , theContainer ) - return aContainer - - #------------------------------------------------------------------------- - - def LoadComponent(self,containerName,componentName,listOfMachine): - container=self._contManager.FindOrStartContainer(containerName,listOfMachine) - implementation="lib"+componentName+"Engine.so" - try: - component = container.load_impl(componentName, implementation) - MESSAGE( "component " + component._get_instanceName() + " launched !" ) - return component - except: - MESSAGE( "component " + componentName + " NOT launched !" 
) - return None - - #------------------------------------------------------------------------- - - - def FindOrLoadComponent(self, containerName, componentName): - sp=containerName.split("/") - if len(sp)==1: - listOfMachine=[] - listOfMachine.append(getShortHostName()) - comp=self.FindComponent(containerName,componentName,listOfMachine) - if comp is None: - return self.LoadComponent(containerName,componentName,listOfMachine) - else: - return comp - pass - else: - params= Engines.MachineParameters(sp[1],sp[0],"LINUX",0,0,0,0) - listOfMachine=self._contManager.GetFittingResources(params,componentName) - ret=self.FindComponent(sp[1],componentName,listOfMachine); - if ret is None: - return self.LoadComponent(sp[1],componentName,listOfMachine) - else: - return ret - pass - diff --git a/src/LifeCycleCORBA/SALOME_LifeCycleCORBA.cxx b/src/LifeCycleCORBA/SALOME_LifeCycleCORBA.cxx index 534553609..7fe8c0fb9 100644 --- a/src/LifeCycleCORBA/SALOME_LifeCycleCORBA.cxx +++ b/src/LifeCycleCORBA/SALOME_LifeCycleCORBA.cxx @@ -360,6 +360,7 @@ SALOME_LifeCycleCORBA::isMpiContainer(const Engines::MachineParameters& params) * - container_name = "" : not relevant * - hostname = "" : not relevant * - OS = "" : not relevant + * - nb_proc = 0 : not relevant * - mem_mb = 0 : not relevant * - cpu_clock = 0 : not relevant * - nb_proc_per_node = 0 : not relevant @@ -375,6 +376,7 @@ void SALOME_LifeCycleCORBA::preSet( Engines::MachineParameters& params) //param.componentList = 0; //param.computerList = 0; params.OS = ""; + params.nb_proc = 0; params.mem_mb = 0; params.cpu_clock = 0; params.nb_proc_per_node = 0; diff --git a/src/LifeCycleCORBA_SWIG/LifeCycleCORBA.py b/src/LifeCycleCORBA_SWIG/LifeCycleCORBA.py index df7445c42..2a459c6e5 100644 --- a/src/LifeCycleCORBA_SWIG/LifeCycleCORBA.py +++ b/src/LifeCycleCORBA_SWIG/LifeCycleCORBA.py @@ -41,7 +41,7 @@ class LifeCycleCORBA (SALOME_LifeCycleCORBA): class MachineParameters (Engines.MachineParameters): def __init__(self, container_name='', 
hostname='', componentList=[], computerList=[], OS='', - mem_mb=0, cpu_clock=0, nb_proc_per_node=0, nb_node=0, isMPI=False, workingdir='', + nb_proc=0, mem_mb=0, cpu_clock=0, nb_proc_per_node=0, nb_node=0, isMPI=False, workingdir='', mode='start', policy='altcycl', parallelLib='', nb_component_nodes=0): Engines.MachineParameters.__init__(self,container_name, hostname, componentList, computerList, OS, mem_mb, cpu_clock, nb_proc_per_node, nb_node, isMPI, workingdir, diff --git a/src/LifeCycleCORBA_SWIG/libSALOME_LifeCycleCORBA.i b/src/LifeCycleCORBA_SWIG/libSALOME_LifeCycleCORBA.i index 22da974ba..2caeec2e7 100644 --- a/src/LifeCycleCORBA_SWIG/libSALOME_LifeCycleCORBA.i +++ b/src/LifeCycleCORBA_SWIG/libSALOME_LifeCycleCORBA.i @@ -153,6 +153,10 @@ using namespace std; { param->OS = CORBA::string_dup(PyString_AsString(value)); } + else if (strcmp(keystr,"nb_proc")==0) + { + param->nb_proc = PyLong_AsLong(value); + } else if (strcmp(keystr,"mem_mb")==0) { param->mem_mb = PyLong_AsLong(value); diff --git a/src/ResourcesManager/ResourcesManager.cxx b/src/ResourcesManager/ResourcesManager.cxx index d1707dffd..b5fb0b2df 100644 --- a/src/ResourcesManager/ResourcesManager.cxx +++ b/src/ResourcesManager/ResourcesManager.cxx @@ -253,6 +253,8 @@ ResourcesManager_cpp::GetFittingResources(const machineParams& params) throw(Res SelectOnlyResourcesWithOS(vec, params.OS.c_str()); // --- set wanted parameters + ResourceDataToSort::_nbOfProcWanted = params.nb_proc; + ResourceDataToSort::_nbOfNodesWanted = params.nb_node; ResourceDataToSort::_nbOfProcPerNodeWanted = params.nb_proc_per_node; diff --git a/src/ResourcesManager/ResourcesManager.hxx b/src/ResourcesManager/ResourcesManager.hxx index 9d95ee3f7..c66b71055 100644 --- a/src/ResourcesManager/ResourcesManager.hxx +++ b/src/ResourcesManager/ResourcesManager.hxx @@ -48,6 +48,7 @@ struct machineParams{ std::string hostname; std::string OS; std::string parallelLib; + unsigned int nb_proc; unsigned int nb_node; unsigned int 
nb_proc_per_node; unsigned int cpu_clock; diff --git a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx index 40870f49e..7d9b6b312 100644 --- a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx +++ b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.cxx @@ -27,6 +27,7 @@ using namespace std; +unsigned int ResourceDataToSort::_nbOfProcWanted = NULL_VALUE; unsigned int ResourceDataToSort::_nbOfNodesWanted = NULL_VALUE; unsigned int ResourceDataToSort::_nbOfProcPerNodeWanted = NULL_VALUE; unsigned int ResourceDataToSort::_CPUFreqMHzWanted = NULL_VALUE; @@ -57,6 +58,19 @@ bool ResourceDataToSort::operator< (const ResourceDataToSort& other) const unsigned int ResourceDataToSort::GetNumberOfPoints() const { unsigned int ret = 0; + //priority 0 : Nb of proc + + if (_nbOfProcWanted != NULL_VALUE) + { + unsigned int nb_proc = _nbOfNodes * _nbOfProcPerNode; + if (nb_proc == _nbOfProcWanted) + ret += 30000; + else if (nb_proc > _nbOfProcWanted) + ret += 20000; + else + ret += 10000; + } + //priority 1 : Nb of nodes if (_nbOfNodesWanted != NULL_VALUE) diff --git a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx index 9c1cf3983..3d00a9185 100755 --- a/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx +++ b/src/ResourcesManager/SALOME_ResourcesCatalog_Parser.hxx @@ -53,6 +53,7 @@ class RESOURCESMANAGER_EXPORT ResourceDataToSort unsigned int _nbOfProcPerNode; unsigned int _CPUFreqMHz; unsigned int _memInMB; + static unsigned int _nbOfProcWanted; static unsigned int _nbOfNodesWanted; static unsigned int _nbOfProcPerNodeWanted; static unsigned int _CPUFreqMHzWanted; diff --git a/src/ResourcesManager/SALOME_ResourcesManager.cxx b/src/ResourcesManager/SALOME_ResourcesManager.cxx index 91d2de5c9..6d4fa049d 100644 --- a/src/ResourcesManager/SALOME_ResourcesManager.cxx +++ b/src/ResourcesManager/SALOME_ResourcesManager.cxx @@ -146,6 +146,7 @@ 
SALOME_ResourcesManager::GetFittingResources(const Engines::MachineParameters& p machineParams p; p.hostname = params.hostname; p.OS = params.OS; + p.nb_proc = params.nb_proc; p.nb_node = params.nb_node; p.nb_proc_per_node = params.nb_proc_per_node; p.cpu_clock = params.cpu_clock; -- 2.39.2