Salome HOME
[EDF28561] : Set of toolkit to ease performance measures
author Anthony Geay <anthony.geay@edf.fr>
Wed, 13 Sep 2023 12:42:24 +0000 (14:42 +0200)
committer Nabil Ghodbane <nabil.ghodbane@cea.fr>
Mon, 2 Oct 2023 11:03:59 +0000 (13:03 +0200)
idl/SALOME_ContainerManager.idl
src/Container/SALOME_ContainerManager.cxx
src/Container/SALOME_ContainerManager.hxx
src/GenericObj/SALOME_GenericObj_i.cc
src/Launcher/KernelLauncher.cxx
src/Launcher/KernelLauncher.hxx
src/Launcher/KernelLauncher.i
src/Launcher/SALOME_Launcher.hxx
src/Launcher_SWIG/Launcher.i
src/ResourcesManager/ResourcesManager.cxx
src/ResourcesManager/ResourcesManager.hxx

index f4de98ae171c4f030b486dd216be495de02dd252..9251df6ab82ea1f3022d1eb6e3bb27f540d4e907 100644 (file)
@@ -89,6 +89,14 @@ interface ContainerManager
   //!  Shutdown all containers that have been launched by the container manager
   void ShutdownContainers();
 
+  //!  Get the time out, in seconds, used while waiting for a launched container to appear in the naming service
+  long GetTimeOutToLaunchServerInSecond();
+
+  //!  Set the time out, in seconds, used while waiting for a launched container to appear in the naming service
+  void SetTimeOutToLaunchServerInSecond(in long timeInSecond);
+
+  //!  Get the delay, in milliseconds, between two naming service lookups while waiting for a launched container
+  long GetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond();
+
+  //!  Set the delay, in milliseconds, between two naming service lookups while waiting for a launched container
+  void SetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond(in long timeInMS);
+
   void SetOverrideEnvForContainers(in KeyValDict env);
 
   KeyValDict GetOverrideEnvForContainers();
index 80940cdba2799c41bcc1a922a08a274d43abf36d..b8ad494e2982ecf9dbdb9bcf8e6a50f679684dc6 100644 (file)
@@ -41,6 +41,8 @@
 #include <sstream>
 #include <string>
 #include <queue>
+#include <thread>
+#include <chrono>
 
 #include <SALOMEconfig.h>
 #include CORBA_CLIENT_HEADER(SALOME_Session)
@@ -66,8 +68,9 @@
 
 const int SALOME_ContainerManager::TIME_OUT_TO_LAUNCH_CONT=60;
 
-const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
-  "/ContainerManager";
+//! Default delay, in milliseconds, between two naming service lookups; the constructor copies it into _delta_time_ns_lookup_in_ms.
+const int SALOME_ContainerManager::DFT_DELTA_TIME_NS_LOOKUP_IN_MS=1000;
+
+const char *SALOME_ContainerManager::_ContainerManagerNameInNS = "/ContainerManager";
 
 omni_mutex SALOME_ContainerManager::_numInstanceMutex;
 
@@ -85,11 +88,12 @@ Utils_Mutex SALOME_ContainerManager::_systemMutex;
 //=============================================================================
 
 SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_NamingService_Abstract *ns)
-  : _nbprocUsed(1)
+  : _nbprocUsed(1),_delta_time_ns_lookup_in_ms(DFT_DELTA_TIME_NS_LOOKUP_IN_MS)
 {
   MESSAGE("constructor");
   _NS = ns;
   _resManager = new SALOME_ResourcesManager_Client(ns);
+  _time_out_in_second = GetTimeOutToLoaunchServer();
 
   PortableServer::POAManager_var pman = poa->the_POAManager();
   _orb = CORBA::ORB::_duplicate(orb) ;
@@ -203,6 +207,26 @@ void SALOME_ContainerManager::Shutdown()
   _poa->deactivate_object(oid);
 }
 
+//! CORBA Method: return the time out, in seconds, currently stored in _time_out_in_second
+CORBA::Long SALOME_ContainerManager::GetTimeOutToLaunchServerInSecond()
+{
+  return _time_out_in_second;
+}
+
+//! CORBA Method: change the time out, in seconds, used while waiting for a launched container
+/*!
+ *  \param timeInSecond new time out, must be >= 0 (0 means no NS lookup attempt at all)
+ *  \throw CORBA::BAD_PARAM when \a timeInSecond is negative
+ */
+void SALOME_ContainerManager::SetTimeOutToLaunchServerInSecond(CORBA::Long timeInSecond)
+{
+  // LaunchContainer derives its attempt counter from this value and loops
+  // while the counter is non zero : a negative time out would give a negative
+  // counter that never reaches zero by decrement.
+  if( timeInSecond < 0 )
+    throw CORBA::BAD_PARAM(0, CORBA::COMPLETED_NO);
+  this->_time_out_in_second = timeInSecond;
+}
+
+//! CORBA Method: return the delay, in milliseconds, currently stored in _delta_time_ns_lookup_in_ms
+CORBA::Long SALOME_ContainerManager::GetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond()
+{
+  return _delta_time_ns_lookup_in_ms;
+}
+
+//! CORBA Method: change the delay, in milliseconds, between two naming service lookups at launch time
+/*!
+ *  \param timeInMS new delay, must be strictly positive
+ *  \throw CORBA::BAD_PARAM when \a timeInMS is zero or negative
+ */
+void SALOME_ContainerManager::SetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond(CORBA::Long timeInMS)
+{
+  // LaunchContainer divides by this value to compute its number of attempts
+  // and sleeps for this duration between two lookups : zero would lead to a
+  // division by zero and a negative value to a negative attempt counter.
+  if( timeInMS <= 0 )
+    throw CORBA::BAD_PARAM(0, CORBA::COMPLETED_NO);
+  this->_delta_time_ns_lookup_in_ms = timeInMS;
+}
+
 //=============================================================================
 //! Loop on all the containers listed in naming service, ask shutdown on each
 /*! CORBA Method:
@@ -670,11 +694,13 @@ SALOME_ContainerManager::LaunchContainer(const Engines::ContainerParameters& par
   else
     {
       // Step 4: Wait for the container
-      int count(GetTimeOutToLoaunchServer());
-      INFOS("[GiveContainer] waiting " << count << " second steps container " << containerNameInNS);
+      double nbTurn = ( (double)this->_time_out_in_second ) * ( 1000.0 / ( (double) this->_delta_time_ns_lookup_in_ms) );
+      int count( (int)nbTurn );
+      INFOS("[GiveContainer] # attempts : " << count << " name in NS : \"" << containerNameInNS << "\"");
+      INFOS("[GiveContainer] # attempts : Time in second before time out : " << this->_time_out_in_second << " Delta time in ms between NS lookup : " << this->_delta_time_ns_lookup_in_ms);
       while (CORBA::is_nil(ret) && count)
         {
-          SleepInSecond(1);
+          std::this_thread::sleep_for(std::chrono::milliseconds(_delta_time_ns_lookup_in_ms));
           count--;
           MESSAGE("[GiveContainer] step " << count << " Waiting for container on " << resource_selected << " with entry in NS = \"" << containerNameInNS << "\"" );
           CORBA::Object_var obj(_NS->Resolve(containerNameInNS.c_str()));
@@ -1154,6 +1180,10 @@ void SALOME_ContainerManager::MakeTheCommandToBeLaunchedASync(std::string& comma
 #endif
 }
 
+/*!
+ * Return the time out, in seconds, granted to a server to be launched and
+ * to register itself into the naming service (NS).
+ */
 int SALOME_ContainerManager::GetTimeOutToLoaunchServer()
 {
   int count(TIME_OUT_TO_LAUNCH_CONT);
index 1e458769f2374e342bab801f2a858bd221be1b8e..67a333482df2c55960b76f931a2f8c2a22d09737 100644 (file)
@@ -61,6 +61,14 @@ public:
 
   void DeclareUsingSalomeSession() { _isSSL = false; }
 
+  //! Return the time out, in seconds, stored in _time_out_in_second
+  CORBA::Long GetTimeOutToLaunchServerInSecond() override;
+
+  //! Store a new time out, in seconds, into _time_out_in_second
+  void SetTimeOutToLaunchServerInSecond(CORBA::Long timeInSecond) override;
+
+  //! Return the delay, in milliseconds, stored in _delta_time_ns_lookup_in_ms
+  CORBA::Long GetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond() override;
+
+  //! Store a new delay, in milliseconds, into _delta_time_ns_lookup_in_ms
+  void SetDeltaTimeBetweenNSLookupAtLaunchTimeInMilliSecond(CORBA::Long timeInMS) override;
+
   static const char *_ContainerManagerNameInNS;
 
 protected:
@@ -197,9 +205,12 @@ public:
   static void SleepInSecond(int ellapseTimeInSecond);
  private:
   static const int TIME_OUT_TO_LAUNCH_CONT;
+  //! Default value given to _delta_time_ns_lookup_in_ms by the constructor
+  static const int DFT_DELTA_TIME_NS_LOOKUP_IN_MS;
   static Utils_Mutex _getenvMutex;
   static Utils_Mutex _systemMutex;
 private:
   std::vector< std::pair<std::string, std::string> > _override_env;
+  //! Time out, in seconds, used by LaunchContainer to compute its number of NS lookup attempts
+  int _time_out_in_second;
+  //! Delay, in milliseconds, LaunchContainer sleeps between two NS lookups
+  int _delta_time_ns_lookup_in_ms;
 };
 #endif
index 5d14412b6d4c88fbc489b3ff051c31fe182c5787..5a5c7d88800a1a3dd3effe85ee6b62240051c2c5 100644 (file)
@@ -73,8 +73,6 @@ namespace SALOME
   */
   GenericObj_i::GenericObj_i(PortableServer::POA_ptr thePOA): myRefCounter(1)
   {
-    MESSAGE("GenericObj_i::GenericObj_i() - this = " << this <<
-           "; CORBA::is_nil(thePOA) = " << CORBA::is_nil(thePOA));
 
     if(CORBA::is_nil(thePOA)) {
 #ifndef WIN32
@@ -86,8 +84,6 @@ namespace SALOME
     else {
       myPOA = PortableServer::POA::_duplicate(thePOA);
     }
-
-    MESSAGE("GenericObj_i::GenericObj_i thePOA: " << thePOA << " myPOA: " << myPOA);
   }
 
   /*!
@@ -100,7 +96,6 @@ namespace SALOME
   */
   PortableServer::POA_ptr GenericObj_i::_default_POA()
   {
-    MESSAGE("GenericObj_i::_default_POA: " << myPOA);
     return PortableServer::POA::_duplicate(myPOA);
   }
 
index d66cb99de94a77df1fa15fcfe20ea1a755aaf3d7..5ed46f83c114bc953b5f6de202692ef3bc58d0b0 100644 (file)
 #include "SALOME_CPythonHelper.hxx"
 
 #include <cstring>
+#include <sstream>
+
+/*!
+ * Export the C++ resources manager of the standalone launcher as a textual handle.
+ *
+ * A copy of the shared pointer returned by SALOME_ResourcesManager::GetImpl is
+ * allocated on the heap and its address is printed into the returned string.
+ * The heap allocated copy is NOT released by this function.
+ * NOTE(review): HandleToLocalInstance (Launcher SWIG module) looks like the
+ * consumer in charge of deleting it - confirm each handle is consumed exactly once.
+ *
+ * \return the address of the heap allocated shared pointer, or an empty string
+ *         when the launcher has no resources manager
+ */
+std::string RetrieveInternalInstanceOfLocalCppResourcesManager()
+{
+  SALOME_ResourcesManager *resMngr = KERNEL::getLauncherSA()->getResourcesManager();
+  if(!resMngr)
+    return std::string();
+  using SharedRM = std::shared_ptr<ResourcesManager_cpp>;
+  SharedRM *handle = new SharedRM(resMngr->GetImpl());
+  std::ostringstream stream;
+  stream << handle;
+  return stream.str();
+}
 
 std::string GetContainerManagerInstance()
 {
index 9963f0bb30b979c612f1f17e5e8a1c958238c6ff..d7f8c880e2dd1f59db27216f10e8dd7845cdfefb 100644 (file)
@@ -21,6 +21,7 @@
 
 #include <string>
 
+std::string RetrieveInternalInstanceOfLocalCppResourcesManager();
 std::string GetContainerManagerInstance();
 std::string GetResourcesManagerInstance();
 std::string GetExternalServerInstance();
index 2e40c1f5d94c15525e732c9573385a762e05dbba..7a97e9a1130bac29281f72f80f2cad4cf85abeda 100644 (file)
@@ -27,6 +27,7 @@
 
 %inline
 {
+  std::string RetrieveInternalInstanceOfLocalCppResourcesManager();
   std::string GetContainerManagerInstance();
   std::string GetResourcesManagerInstance();
   std::string GetExternalServerInstance();
index bcb0849534b414540c6f8622c0db5d751c28ac45..188cc5e3ee31164568a6dbbd9bbd515b3499b609 100644 (file)
@@ -80,6 +80,8 @@ public:
   
   void DeclareUsingSalomeSession();
 
+  //! Return the raw pointer stored in _ResManager (may be null, callers are expected to check)
+  SALOME_ResourcesManager *getResourcesManager() const { return _ResManager; }
+
   static const char *_LauncherNameInNS;
 
   static JobParameters_cpp
index 72913846d65f7c86f8b5a3f382a27cd1df896b25..f01e3126ef67d510a5fa39349e0600e93dbe067d 100644 (file)
@@ -23,6 +23,8 @@
 #include "Launcher.hxx"
 #include "ResourcesManager.hxx"
 
+#include <sstream>
+
 struct ResourceDefinition_cpp
 {
 public:
@@ -44,6 +46,17 @@ public:
   bool can_run_containers;
   std::string working_directory;
 };
+
+/*!
+ * Rebuild a shared pointer on ResourcesManager_cpp from a textual handle.
+ *
+ * \param ptrInStringFrmt address, as printed by operator<< on a pointer, of a
+ *        heap allocated std::shared_ptr<ResourcesManager_cpp>
+ * \return a copy of the pointed shared pointer, or an empty shared pointer when
+ *         the handle is empty, cannot be parsed or designates a null address
+ *
+ * The heap allocated shared pointer designated by the handle is deleted here,
+ * so a given handle must not be passed twice to this function.
+ */
+std::shared_ptr<ResourcesManager_cpp> HandleToLocalInstance(const std::string& ptrInStringFrmt)
+{
+  std::istringstream iss(ptrInStringFrmt);
+  void *zePtr(nullptr);
+  // An empty or malformed handle leaves a null address : dereferencing it
+  // would crash the interpreter, so an empty shared pointer is returned instead.
+  if( !(iss >> zePtr) || !zePtr )
+    return std::shared_ptr<ResourcesManager_cpp>();
+  // Take ownership of the heap allocated shared pointer so that it is released on exit
+  std::unique_ptr< std::shared_ptr<ResourcesManager_cpp> > effPtr( static_cast<std::shared_ptr<ResourcesManager_cpp> *>(zePtr) );
+  return *effPtr;
+}
 %}
 
 %include "std_string.i"
@@ -159,6 +172,8 @@ class ResourcesManager_cpp
 public:
   ResourcesManager_cpp(const char *xmlFilePath);
   std::vector<std::string> GetFittingResources(const resourceParams& params);
+  void WriteInXmlFile(std::string xml_file);
+  void DeleteAllResourcesInCatalog();
 %extend
 {
   ResourceDefinition_cpp GetResourceDefinition(const std::string& name)
@@ -186,9 +201,56 @@ public:
 
     return swig_result;
   }
+
+  // std::string overload for the target language : forwards to the C++ method, which takes a C string
+  void DeleteResourceInCatalog(const std::string& name)
+  {
+    $self->DeleteResourceInCatalog(name.c_str());
+  }
+  
+  // Convert the flat ResourceDefinition_cpp exposed to the target language into
+  // a ParserResourcesType and register it into the catalog.
+  void AddResourceInCatalog (const ResourceDefinition_cpp& new_resource)
+  {
+    ParserResourcesType entry;
+    // identification and access
+    entry.Name = new_resource.name;
+    entry.HostName = new_resource.hostname;
+    entry.setResourceTypeStr( new_resource.type );
+    entry.setAccessProtocolTypeStr( new_resource.protocol );
+    entry.UserName = new_resource.username;
+    entry.AppliPath = new_resource.applipath;
+    entry.OS = new_resource.OS;
+    // hardware description
+    entry.DataForSort._memInMB = new_resource.mem_mb;
+    entry.DataForSort._CPUFreqMHz = new_resource.cpu_clock;
+    entry.DataForSort._nbOfNodes = new_resource.nb_node;
+    entry.DataForSort._nbOfProcPerNode = new_resource.nb_proc_per_node;
+    // batch / MPI configuration
+    entry.setBatchTypeStr(new_resource.batch);
+    entry.setMpiImplTypeStr(new_resource.mpiImpl);
+    entry.setClusterInternalProtocolStr(new_resource.iprotocol);
+    // capabilities
+    entry.can_launch_batch_jobs = new_resource.can_launch_batch_jobs;
+    entry.can_run_containers = new_resource.can_run_containers;
+    entry.working_directory = new_resource.working_directory;
+    $self->AddResourceInCatalog(entry);
+  }
+  
+  // Forward to ResourcesManager_cpp::ParseXmlFiles ; any value returned by the C++ method is dropped
+  void ParseXmlFiles()
+  {
+    $self->ParseXmlFiles();
+  }
+  
+  // Return the keys of the map returned by GetList, in the iteration order of that map
+  std::vector<std::string> GetListOfEntries() const
+  {
+    const MapOfParserResourcesType& allRes = $self->GetList();
+    std::vector<std::string> ret;
+    ret.reserve(allRes.size());
+    // iterate by const reference : iterating by value copies each
+    // (name, ParserResourcesType) pair only to read its key
+    for(const auto& it : allRes)
+      ret.push_back(it.first);
+    return ret;
+  }
 }
 };
 
+%inline
+{
+  // Declaration only : the definition lives in the %{ %} section of this interface file
+  std::shared_ptr<ResourcesManager_cpp> HandleToLocalInstance(const std::string& ptrInStringFrmt);
+}
+
 %exception
 {
   try
@@ -229,3 +291,83 @@ public:
   long createJobWithFile(std::string xmlExecuteFile, std::string clusterName);
   void SetResourcesManager(std::shared_ptr<ResourcesManager_cpp>& rm );
 };
+
+%pythoncode %{
+def CreateSSHContainerResource(hostname,applipath,nbOfNodes=1):
+  """Shortcut for CreateContainerResource with the protocol set to "ssh"."""
+  return CreateContainerResource(hostname=hostname,applipath=applipath,protocol="ssh",nbOfNodes=nbOfNodes)
+
+def CreateSRUNContainerResource(hostname,applipath,nbOfNodes=1):
+  """Shortcut for CreateContainerResource with the protocol set to "srun"."""
+  return CreateContainerResource(hostname=hostname,applipath=applipath,protocol="srun",nbOfNodes=nbOfNodes)
+
+def CreateContainerResource(hostname,applipath,protocol,nbOfNodes=1):
+  """Build a ResourceDefinition_cpp describing a machine able to run containers.
+
+  hostname  -- machine name; only the part before the first "." is kept, and it
+               is used both as resource name and as host name
+  applipath -- stored in the applipath attribute
+  protocol  -- stored in both the protocol and iprotocol attributes
+  nbOfNodes -- stored in nb_node (nb_proc_per_node is always 1)
+
+  The user name is the one returned by getpass.getuser().
+  """
+  import getpass
+  shortName = hostname.split(".")[0]
+  settings = (
+    ("name",shortName),
+    ("hostname",shortName),
+    ("protocol",protocol),
+    ("applipath",applipath),
+    ("nb_node",nbOfNodes),
+    ("nb_proc_per_node",1),
+    ("can_run_containers",True),
+    ("can_launch_batch_jobs",False),
+    ("mpiImpl","no mpi"),
+    ("iprotocol",protocol),
+    ("type","single_machine"),
+    ("username",getpass.getuser()),
+  )
+  res = ResourceDefinition_cpp()
+  for attrName,attrValue in settings:
+    setattr(res,attrName,attrValue)
+  return res
+
+def ResourceDefinition_cpp_repr(self):
+  """Return a multi-line "label = value" description of a resource definition.
+
+  One line is produced per displayed attribute; only the application path is
+  surrounded by double quotes.
+  """
+  plain = "{} = {}"
+  quoted = "{} = \"{}\""
+  # (displayed label, attribute read on self, line pattern)
+  layout = (
+    ("name","name",plain),
+    ("hostname","hostname",plain),
+    ("type","type",plain),
+    ("protocol","protocol",plain),
+    ("userName","username",plain),
+    ("appliPath","applipath",quoted),
+    ("mpi","mpiImpl",plain),
+    ("nbOfNodes","nb_node",plain),
+    ("nbOfProcPerNode","nb_proc_per_node",plain),
+    ("canRunContainer","can_run_containers",plain),
+  )
+  lines = []
+  for label,attrName,pattern in layout:
+    lines.append( pattern.format(label,getattr(self,attrName)) )
+  return "\n".join( lines )
+
+def ResourcesManager_cpp_GetList(self):
+  """Return a dict mapping each entry of GetListOfEntries to its resource definition."""
+  catalog = {}
+  for entryName in self.GetListOfEntries():
+    catalog[entryName] = self.GetResourceDefinition(entryName)
+  return catalog
+
+def ResourcesManager_cpp___getitem__(self,name):
+  """Subscript access : rm[name] is the same call as rm.GetResourceDefinition(name)."""
+  return self.GetResourceDefinition(name)
+
+def ResourcesManager_cpp___repr__(self):
+  """Return the string form of the value returned by self.GetList()."""
+  return str( self.GetList() )
+
+def RetrieveRMCppSingleton():
+  """Return the C++ resources manager exported by the KernelLauncher module.
+
+  The textual handle produced by
+  KernelLauncher.RetrieveInternalInstanceOfLocalCppResourcesManager is turned
+  back into a ResourcesManager_cpp instance with HandleToLocalInstance.
+  """
+  import KernelLauncher
+  handle = KernelLauncher.RetrieveInternalInstanceOfLocalCppResourcesManager()
+  return HandleToLocalInstance( handle )
+
+def GetPlayGroundInsideASlurmJob():
+  """Run "srun hostname" and count how many times each host name is printed.
+
+  Return a collections.defaultdict(int) mapping each non empty output line to
+  its number of occurrences. subprocess.CalledProcessError is raised when the
+  command exits with a non zero status.
+  """
+  from collections import defaultdict
+  import subprocess as sp
+  rawOutput = sp.check_output(["srun","hostname"]).decode()
+  occurrences = defaultdict(int)
+  for line in rawOutput.split("\n"):
+      if line != "":
+          occurrences[line]+=1
+  return occurrences
+
+def BuildCatalogFromScratch(protocol):
+  """Replace the content of the C++ resources catalog by the hosts of the current job.
+
+  protocol -- protocol given to CreateContainerResource for every host
+
+  The hosts and their multiplicity come from GetPlayGroundInsideASlurmJob and
+  the application path from the APPLI environment variable (KeyError when it is
+  not set and at least one host has been found).
+  """
+  import os
+  d = GetPlayGroundInsideASlurmJob()
+  rmcpp = RetrieveRMCppSingleton()
+  # Build every definition BEFORE clearing the catalog : a failure here (e.g.
+  # APPLI not set) must not leave the catalog emptied.
+  newResources = [CreateContainerResource(hostname=k,applipath=os.environ["APPLI"],protocol=protocol,nbOfNodes=v) for k,v in d.items()]
+  rmcpp.DeleteAllResourcesInCatalog()
+  for contRes in newResources:
+      rmcpp.AddResourceInCatalog(contRes)
+
+# Attach the pure python helpers defined above to the SWIG generated classes
+ResourceDefinition_cpp.repr = ResourceDefinition_cpp_repr
+ResourceDefinition_cpp.__repr__ = ResourceDefinition_cpp_repr
+ResourcesManager_cpp.GetList = ResourcesManager_cpp_GetList
+ResourcesManager_cpp.__getitem__ = ResourcesManager_cpp___getitem__
+ResourcesManager_cpp.__repr__ = ResourcesManager_cpp___repr__
+%}
index 6e8535a7e6b81fb5dc731d75e06da9125f3971c2..fa528d5234948e4e81449f385419a42c8d6dde5e 100644 (file)
@@ -355,6 +355,11 @@ ResourcesManager_cpp::AddResourceInCatalog(const ParserResourcesType & new_resou
   _resourcesList[new_resource.Name] = new_resource;
 }
 
+//=============================================================================
+/*!
+ *  Removes every entry of _resourcesList
+ */
+//=============================================================================
+
+void ResourcesManager_cpp::DeleteAllResourcesInCatalog()
+{
+  _resourcesList.clear();
+}
+
 //=============================================================================
 /*!
  *  Deletes a resource from the catalog
index d72308f253d8f38ae3e19d11b590444b2e9d983c..95e13c541cb67fcba99955b6bebb9cbc8a2e07d3 100644 (file)
@@ -82,6 +82,8 @@ class RESOURCESMANAGER_EXPORT ResourcesManager_cpp
     std::string Find(const std::string& policy, const std::vector<std::string>& listOfResources) const;
 
     void AddResourceInCatalog (const ParserResourcesType & new_resource);
+    
+    void DeleteAllResourcesInCatalog();
 
     void DeleteResourceInCatalog(const char * name);