From a24f0d2e91a08f7a36b488737dbadcc20b6b2268 Mon Sep 17 00:00:00 2001 From: Anthony Geay Date: Wed, 13 Sep 2023 14:42:24 +0200 Subject: [PATCH] [EDF28561] : Set of toolkit to ease performance measures --- idl/SALOME_ContainerManager.idl | 8 ++ src/Container/SALOME_ContainerManager.cxx | 42 ++++++- src/Container/SALOME_ContainerManager.hxx | 11 ++ src/GenericObj/SALOME_GenericObj_i.cc | 5 - src/Launcher/KernelLauncher.cxx | 14 +++ src/Launcher/KernelLauncher.hxx | 1 + src/Launcher/KernelLauncher.i | 1 + src/Launcher/SALOME_Launcher.hxx | 2 + src/Launcher_SWIG/Launcher.i | 142 ++++++++++++++++++++++ src/ResourcesManager/ResourcesManager.cxx | 5 + src/ResourcesManager/ResourcesManager.hxx | 2 + 11 files changed, 222 insertions(+), 11 deletions(-) diff --git a/idl/SALOME_ContainerManager.idl b/idl/SALOME_ContainerManager.idl index f4de98ae1..9251df6ab 100644 --- a/idl/SALOME_ContainerManager.idl +++ b/idl/SALOME_ContainerManager.idl @@ -89,6 +89,14 @@ interface ContainerManager //! Shutdown all containers that have been launched by the container manager void ShutdownContainers(); + long GetTimeOutToLaunchServerInSecond(); + + void SetTimeOutToLaunchServerInSecond(in long timeInSecond); + + long GetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond(); + + void SetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond(in long timeInMS); + void SetOverrideEnvForContainers(in KeyValDict env); KeyValDict GetOverrideEnvForContainers(); diff --git a/src/Container/SALOME_ContainerManager.cxx b/src/Container/SALOME_ContainerManager.cxx index 80940cdba..b8ad494e2 100644 --- a/src/Container/SALOME_ContainerManager.cxx +++ b/src/Container/SALOME_ContainerManager.cxx @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include CORBA_CLIENT_HEADER(SALOME_Session) @@ -66,8 +68,9 @@ const int SALOME_ContainerManager::TIME_OUT_TO_LAUNCH_CONT=60; -const char *SALOME_ContainerManager::_ContainerManagerNameInNS = - "/ContainerManager"; +const int SALOME_ContainerManager::DFT_DELTA_TIME_NS_LOOKUP_IN_MS=1000; + +const char *SALOME_ContainerManager::_ContainerManagerNameInNS = "/ContainerManager"; omni_mutex SALOME_ContainerManager::_numInstanceMutex; @@ -85,11 +88,12 @@ Utils_Mutex SALOME_ContainerManager::_systemMutex; //============================================================================= SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_NamingService_Abstract *ns) - : _nbprocUsed(1) + : _nbprocUsed(1),_delta_time_ns_lookup_in_ms(DFT_DELTA_TIME_NS_LOOKUP_IN_MS) { MESSAGE("constructor"); _NS = ns; _resManager = new SALOME_ResourcesManager_Client(ns); + _time_out_in_second = GetTimeOutToLoaunchServer(); PortableServer::POAManager_var pman = poa->the_POAManager(); _orb = CORBA::ORB::_duplicate(orb) ; @@ -203,6 +207,26 @@ void SALOME_ContainerManager::Shutdown() _poa->deactivate_object(oid); } +CORBA::Long SALOME_ContainerManager::GetTimeOutToLaunchServerInSecond() +{ + return this->_time_out_in_second; +} + +void SALOME_ContainerManager::SetTimeOutToLaunchServerInSecond(CORBA::Long timeInSecond) +{ + this->_time_out_in_second = timeInSecond; +} + +CORBA::Long SALOME_ContainerManager::GetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond() +{ + return this->_delta_time_ns_lookup_in_ms; +} + +void SALOME_ContainerManager::SetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond(CORBA::Long timeInMS) +{ + this->_delta_time_ns_lookup_in_ms = timeInMS; +} + //============================================================================= //! Loop on all the containers listed in naming service, ask shutdown on each /*! CORBA Method: @@ -670,11 +694,13 @@ SALOME_ContainerManager::LaunchContainer(const Engines::ContainerParameters& par else { // Step 4: Wait for the container - int count(GetTimeOutToLoaunchServer()); - INFOS("[GiveContainer] waiting " << count << " second steps container " << containerNameInNS); + double nbTurn = ( (double)this->_time_out_in_second ) * ( 1000.0 / ( (double) this->_delta_time_ns_lookup_in_ms) ); + int count( (int)nbTurn ); + INFOS("[GiveContainer] # attempts : " << count << " name in NS : \"" << containerNameInNS << "\""); + INFOS("[GiveContainer] # attempts : Time in second before time out : " << this->_time_out_in_second << " Delta time in ms between NS lookup : " << this->_delta_time_ns_lookup_in_ms); while (CORBA::is_nil(ret) && count) { - SleepInSecond(1); + std::this_thread::sleep_for(std::chrono::milliseconds(_delta_time_ns_lookup_in_ms)); count--; MESSAGE("[GiveContainer] step " << count << " Waiting for container on " << resource_selected << " with entry in NS = \"" << containerNameInNS << "\"" ); CORBA::Object_var obj(_NS->Resolve(containerNameInNS.c_str())); @@ -1154,6 +1180,10 @@ void SALOME_ContainerManager::MakeTheCommandToBeLaunchedASync(std::string& comma #endif } +/*! + * Return in second the time out to give chance to server to be launched and + * to register into NS + */ int SALOME_ContainerManager::GetTimeOutToLoaunchServer() { int count(TIME_OUT_TO_LAUNCH_CONT); diff --git a/src/Container/SALOME_ContainerManager.hxx b/src/Container/SALOME_ContainerManager.hxx index 1e458769f..67a333482 100644 --- a/src/Container/SALOME_ContainerManager.hxx +++ b/src/Container/SALOME_ContainerManager.hxx @@ -61,6 +61,14 @@ public: void DeclareUsingSalomeSession() { _isSSL = false; } + CORBA::Long GetTimeOutToLaunchServerInSecond() override; + + void SetTimeOutToLaunchServerInSecond(CORBA::Long timeInSecond) override; + + CORBA::Long GetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond() override; + + void SetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond(CORBA::Long timeInMS) override; + static const char *_ContainerManagerNameInNS; protected: @@ -197,9 +205,12 @@ public: static void SleepInSecond(int ellapseTimeInSecond); private: static const int TIME_OUT_TO_LAUNCH_CONT; + static const int DFT_DELTA_TIME_NS_LOOKUP_IN_MS; static Utils_Mutex _getenvMutex; static Utils_Mutex _systemMutex; private: std::vector< std::pair > _override_env; + int _time_out_in_second; + int _delta_time_ns_lookup_in_ms; }; #endif diff --git a/src/GenericObj/SALOME_GenericObj_i.cc b/src/GenericObj/SALOME_GenericObj_i.cc index 5d14412b6..5a5c7d888 100644 --- a/src/GenericObj/SALOME_GenericObj_i.cc +++ b/src/GenericObj/SALOME_GenericObj_i.cc @@ -73,8 +73,6 @@ namespace SALOME */ GenericObj_i::GenericObj_i(PortableServer::POA_ptr thePOA): myRefCounter(1) { - MESSAGE("GenericObj_i::GenericObj_i() - this = " << this << - "; CORBA::is_nil(thePOA) = " << CORBA::is_nil(thePOA)); if(CORBA::is_nil(thePOA)) { #ifndef WIN32 @@ -86,8 +84,6 @@ namespace SALOME else { myPOA = PortableServer::POA::_duplicate(thePOA); } - - MESSAGE("GenericObj_i::GenericObj_i thePOA: " << thePOA << " myPOA: " << myPOA); } /*! @@ -100,7 +96,6 @@ namespace SALOME */ PortableServer::POA_ptr GenericObj_i::_default_POA() { - MESSAGE("GenericObj_i::_default_POA: " << myPOA); return PortableServer::POA::_duplicate(myPOA); } diff --git a/src/Launcher/KernelLauncher.cxx b/src/Launcher/KernelLauncher.cxx index d66cb99de..5ed46f83c 100644 --- a/src/Launcher/KernelLauncher.cxx +++ b/src/Launcher/KernelLauncher.cxx @@ -28,6 +28,20 @@ #include "SALOME_CPythonHelper.hxx" #include +#include + +std::string RetrieveInternalInstanceOfLocalCppResourcesManager() +{ + SALOME_Launcher *launcher = KERNEL::getLauncherSA(); + SALOME_ResourcesManager *rm(launcher->getResourcesManager()); + if(rm) + { + std::shared_ptr *ret1(new std::shared_ptr(rm->GetImpl())); + std::ostringstream oss; oss << ret1; + return oss.str(); + } + return std::string(); +} std::string GetContainerManagerInstance() { diff --git a/src/Launcher/KernelLauncher.hxx b/src/Launcher/KernelLauncher.hxx index 9963f0bb3..d7f8c880e 100644 --- a/src/Launcher/KernelLauncher.hxx +++ b/src/Launcher/KernelLauncher.hxx @@ -21,6 +21,7 @@ #include +std::string RetrieveInternalInstanceOfLocalCppResourcesManager(); std::string GetContainerManagerInstance(); std::string GetResourcesManagerInstance(); std::string GetExternalServerInstance(); diff --git a/src/Launcher/KernelLauncher.i b/src/Launcher/KernelLauncher.i index 2e40c1f5d..7a97e9a11 100644 --- a/src/Launcher/KernelLauncher.i +++ b/src/Launcher/KernelLauncher.i @@ -27,6 +27,7 @@ %inline { + std::string RetrieveInternalInstanceOfLocalCppResourcesManager(); std::string GetContainerManagerInstance(); std::string GetResourcesManagerInstance(); std::string GetExternalServerInstance(); diff --git a/src/Launcher/SALOME_Launcher.hxx b/src/Launcher/SALOME_Launcher.hxx index bcb084953..188cc5e3e 100644 --- a/src/Launcher/SALOME_Launcher.hxx +++ b/src/Launcher/SALOME_Launcher.hxx @@ -80,6 +80,8 @@ public: void DeclareUsingSalomeSession(); + SALOME_ResourcesManager *getResourcesManager() const { return _ResManager; } + static const char *_LauncherNameInNS; static JobParameters_cpp diff --git a/src/Launcher_SWIG/Launcher.i b/src/Launcher_SWIG/Launcher.i index 72913846d..f01e3126e 100644 --- a/src/Launcher_SWIG/Launcher.i +++ b/src/Launcher_SWIG/Launcher.i @@ -23,6 +23,8 @@ #include "Launcher.hxx" #include "ResourcesManager.hxx" +#include + struct ResourceDefinition_cpp { public: @@ -44,6 +46,17 @@ public: bool can_run_containers; std::string working_directory; }; + +std::shared_ptr HandleToLocalInstance(const std::string& ptrInStringFrmt) +{ + std::istringstream iss(ptrInStringFrmt); + void *zePtr(nullptr); + iss >> zePtr; + std::shared_ptr *effPtr = reinterpret_cast *>(zePtr); + std::shared_ptr ret(*effPtr); + delete effPtr; + return ret; +} %} %include "std_string.i" @@ -159,6 +172,8 @@ class ResourcesManager_cpp public: ResourcesManager_cpp(const char *xmlFilePath); std::vector GetFittingResources(const resourceParams& params); + void WriteInXmlFile(std::string xml_file); + void DeleteAllResourcesInCatalog(); %extend { ResourceDefinition_cpp GetResourceDefinition(const std::string& name) @@ -186,9 +201,56 @@ public: return swig_result; } + + void DeleteResourceInCatalog(const std::string& name) + { + $self->DeleteResourceInCatalog(name.c_str()); + } + + void AddResourceInCatalog (const ResourceDefinition_cpp& new_resource) + { + ParserResourcesType new_resource_cpp; + new_resource_cpp.Name = new_resource.name; + new_resource_cpp.HostName = new_resource.hostname; + new_resource_cpp.setResourceTypeStr( new_resource.type ); + new_resource_cpp.setAccessProtocolTypeStr( new_resource.protocol ); + new_resource_cpp.UserName = new_resource.username; + new_resource_cpp.AppliPath = new_resource.applipath; + new_resource_cpp.OS = new_resource.OS; + new_resource_cpp.DataForSort._memInMB = new_resource.mem_mb; + new_resource_cpp.DataForSort._CPUFreqMHz = new_resource.cpu_clock; + new_resource_cpp.DataForSort._nbOfNodes = new_resource.nb_node; + new_resource_cpp.DataForSort._nbOfProcPerNode = new_resource.nb_proc_per_node; + new_resource_cpp.setBatchTypeStr(new_resource.batch); + new_resource_cpp.setMpiImplTypeStr(new_resource.mpiImpl); + new_resource_cpp.setClusterInternalProtocolStr(new_resource.iprotocol); + new_resource_cpp.can_launch_batch_jobs = new_resource.can_launch_batch_jobs; + new_resource_cpp.can_run_containers = new_resource.can_run_containers; + new_resource_cpp.working_directory = new_resource.working_directory; + $self->AddResourceInCatalog(new_resource_cpp); + } + + void ParseXmlFiles() + { + $self->ParseXmlFiles(); + } + + std::vector GetListOfEntries() const + { + const MapOfParserResourcesType& allRes = $self->GetList(); + std::vector ret; + for(auto it : allRes) + ret.push_back(it.first); + return ret; + } } }; +%inline +{ + std::shared_ptr HandleToLocalInstance(const std::string& ptrInStringFrmt); +} + %exception { try @@ -229,3 +291,83 @@ public: long createJobWithFile(std::string xmlExecuteFile, std::string clusterName); void SetResourcesManager(std::shared_ptr& rm ); }; + +%pythoncode %{ +def CreateSSHContainerResource(hostname,applipath,nbOfNodes=1): + return CreateContainerResource(hostname,applipath,"ssh",nbOfNodes) + +def CreateSRUNContainerResource(hostname,applipath,nbOfNodes=1): + return CreateContainerResource(hostname,applipath,"srun",nbOfNodes) + +def CreateContainerResource(hostname,applipath,protocol,nbOfNodes=1): + import getpass + ret = ResourceDefinition_cpp() + ret.name = hostname.split(".")[0] + ret.hostname = ret.name + ret.protocol = protocol + ret.applipath = applipath + ret.nb_node = nbOfNodes + ret.nb_proc_per_node = 1 + ret.can_run_containers = True + ret.can_launch_batch_jobs = False + ret.mpiImpl = "no mpi" + ret.iprotocol = protocol + ret.type = "single_machine" + ret.username = getpass.getuser() + return ret + +def ResourceDefinition_cpp_repr(self): + pat0 = "{} = {}" + pat1 = "{} = \"{}\"" + data = [("name","name",pat0), + ("hostname","hostname",pat0), + ("type","type",pat0), + ("protocol","protocol",pat0), + ("userName","username",pat0), + ("appliPath","applipath",pat1), + ("mpi","mpiImpl",pat0), + ("nbOfNodes","nb_node",pat0), + ("nbOfProcPerNode","nb_proc_per_node",pat0), + ("canRunContainer","can_run_containers",pat0) + ] + ret = [c.format(a,getattr(self,b)) for a,b,c in data] + return "\n".join( ret ) + +def ResourcesManager_cpp_GetList(self): + return {name:self.GetResourceDefinition(name) for name in self.GetListOfEntries()} + +def ResourcesManager_cpp___getitem__(self,name): + return self.GetResourceDefinition(name) + +def ResourcesManager_cpp___repr__(self): + return str( self.GetList() ) + +def RetrieveRMCppSingleton(): + import KernelLauncher + return HandleToLocalInstance( KernelLauncher.RetrieveInternalInstanceOfLocalCppResourcesManager() ) + +def GetPlayGroundInsideASlurmJob(): + import subprocess as sp + cont = sp.check_output(["srun","hostname"]) + nodesMul = [elt for elt in cont.decode().split("\n") if elt != ""] + from collections import defaultdict + d = defaultdict(int) + for elt in nodesMul: + d[elt]+=1 + return d + +def BuildCatalogFromScratch(protocol): + import os + d = GetPlayGroundInsideASlurmJob() + rmcpp = RetrieveRMCppSingleton() + rmcpp.DeleteAllResourcesInCatalog() + for k,v in d.items(): + contRes = CreateContainerResource(hostname=k,applipath=os.environ["APPLI"],protocol=protocol,nbOfNodes=v) + rmcpp.AddResourceInCatalog(contRes) + +ResourceDefinition_cpp.repr = ResourceDefinition_cpp_repr +ResourceDefinition_cpp.__repr__ = ResourceDefinition_cpp_repr +ResourcesManager_cpp.GetList = ResourcesManager_cpp_GetList +ResourcesManager_cpp.__getitem__ = ResourcesManager_cpp___getitem__ +ResourcesManager_cpp.__repr__ = ResourcesManager_cpp___repr__ +%} diff --git a/src/ResourcesManager/ResourcesManager.cxx b/src/ResourcesManager/ResourcesManager.cxx index 6e8535a7e..fa528d523 100644 --- a/src/ResourcesManager/ResourcesManager.cxx +++ b/src/ResourcesManager/ResourcesManager.cxx @@ -355,6 +355,11 @@ ResourcesManager_cpp::AddResourceInCatalog(const ParserResourcesType & new_resou _resourcesList[new_resource.Name] = new_resource; } +void ResourcesManager_cpp::DeleteAllResourcesInCatalog() +{ + _resourcesList.clear(); +} + //============================================================================= /*! * Deletes a resource from the catalog diff --git a/src/ResourcesManager/ResourcesManager.hxx b/src/ResourcesManager/ResourcesManager.hxx index d72308f25..95e13c541 100644 --- a/src/ResourcesManager/ResourcesManager.hxx +++ b/src/ResourcesManager/ResourcesManager.hxx @@ -82,6 +82,8 @@ class RESOURCESMANAGER_EXPORT ResourcesManager_cpp std::string Find(const std::string& policy, const std::vector& listOfResources) const; void AddResourceInCatalog (const ParserResourcesType & new_resource); + + void DeleteAllResourcesInCatalog(); void DeleteResourceInCatalog(const char * name); -- 2.39.2