SALOME platform Git repositories - modules/kernel.git/commitdiff
Salome HOME
debug for CCRT BR_FSH_DEV BR_PARAVIS_LOT1_2 V6_0_0
author secher <secher>
Fri, 18 Dec 2009 15:11:33 +0000 (15:11 +0000)
committer secher <secher>
Fri, 18 Dec 2009 15:11:33 +0000 (15:11 +0000)
src/Container/SALOME_ContainerManager.cxx
src/Container/SALOME_ContainerManager.hxx
src/Launcher/Launcher_Job_SALOME.cxx

index 0d6d481198c5c5b617b23980be4b385f26983313..f59f7b814fd1f8655b7dd5a2b181cb5a794ad344 100644 (file)
@@ -44,6 +44,9 @@ using namespace std;
 const char *SALOME_ContainerManager::_ContainerManagerNameInNS = 
   "/ContainerManager";
 
+omni_mutex SALOME_ContainerManager::_numInstanceMutex;
+
+
 //=============================================================================
 /*! 
  *  Constructor
@@ -53,8 +56,7 @@ const char *SALOME_ContainerManager::_ContainerManagerNameInNS =
  */
 //=============================================================================
 
-SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, 
-                                                 SALOME_ResourcesManager *rm, SALOME_NamingService *ns)
+SALOME_ContainerManager::SALOME_ContainerManager(CORBA::ORB_ptr orb, PortableServer::POA_var poa, SALOME_ResourcesManager *rm, SALOME_NamingService *ns):_nbprocUsed(0)
 {
   MESSAGE("constructor");
   _NS = ns;
@@ -220,12 +222,12 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
     {
       if (mode == "find")
       {
-        MESSAGE("[GiveContainer] no container found");
-        return ret;
+       MESSAGE("[GiveContainer] no container found");
+       return ret;
       }
       else
       {
-        mode = "start";
+       mode = "start";
       }
     }
   }
@@ -243,8 +245,8 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
       Engines::Container_ptr cont = FindContainer(params, possibleResources[i].in());
       try
       {
-        if(!cont->_non_existent())
-          local_resources.push_back(string(possibleResources[i]));
+       if(!cont->_non_existent())
+         local_resources.push_back(string(possibleResources[i]));
       }
       catch(CORBA::Exception&) {}
     }
@@ -273,13 +275,27 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
   }
   MESSAGE("[GiveContainer] Resource selected is: " << resource_selected);
 
+  _numInstanceMutex.lock();
+
   // Step 5: get container in the naming service
   Engines::ResourceDefinition_var resource_definition = _ResManager->GetResourceDefinition(resource_selected.c_str());
   std::string hostname(resource_definition->name.in());
   std::string containerNameInNS;
-  if(params.isMPI)
+  if(params.isMPI){
+    int nbproc;
+    if ( (params.resource_params.nb_node <= 0) && (params.resource_params.nb_proc_per_node <= 0) )
+      nbproc = 1;
+    else if ( params.resource_params.nb_node == 0 )
+      nbproc = params.resource_params.nb_proc_per_node;
+    else if ( params.resource_params.nb_proc_per_node == 0 )
+      nbproc = params.resource_params.nb_node;
+    else
+      nbproc = params.resource_params.nb_node * params.resource_params.nb_proc_per_node;
+    if( getenv("LIBBATCH_NODEFILE") != NULL )
+      machinesFile(nbproc);
     // A mpi parallel container register on zero node in NS
     containerNameInNS = _NS->BuildContainerNameForNS(params, GetMPIZeroNode(hostname).c_str());
+  }
   else
     containerNameInNS = _NS->BuildContainerNameForNS(params, hostname.c_str());
   MESSAGE("[GiveContainer] Container name in the naming service: " << containerNameInNS);
@@ -295,13 +311,15 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
       Engines::Container_var cont=Engines::Container::_narrow(obj);
       if(!cont->_non_existent())
       {
-        if(std::string(params.mode.in())=="getorstart" or std::string(params.mode.in())=="get")
-          return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/
-        else
-        {
-          INFOS("[GiveContainer] A container is already registered with the name: " << containerNameInNS << ", shutdown the existing container");
-          cont->Shutdown(); // shutdown the registered container if it exists
-        }
+       if(std::string(params.mode.in())=="getorstart" or std::string(params.mode.in())=="get"){
+         _numInstanceMutex.unlock();
+         return cont._retn(); /* the container exists and params.mode is getorstart or get use it*/
+       }
+       else
+       {
+         INFOS("[GiveContainer] A container is already registered with the name: " << containerNameInNS << ", shutdown the existing container");
+         cont->Shutdown(); // shutdown the registered container if it exists
+       }
       }
     }
     catch(CORBA::Exception&)
@@ -318,6 +336,7 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
   if (std::string(local_params.parallelLib.in()) != "")
   {
     INFOS("[GiveContainer] PaCO++ container are not currently available");
+    _numInstanceMutex.unlock();
     return ret;
   }
   // Classic or Exe ?
@@ -331,6 +350,7 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
     if (CORBA::is_nil (Catalog))
     {
       INFOS("[GiveContainer] Module Catalog is not found -> cannot launch a container");
+      _numInstanceMutex.unlock();
       return ret;
     }
     // Loop through component list
@@ -340,39 +360,47 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
       SALOME_ModuleCatalog::Acomponent_var compoInfo = Catalog->GetComponent(compoi);
       if (CORBA::is_nil (compoInfo))
       {
-        continue;
+       continue;
       }
       SALOME_ModuleCatalog::ImplType impl=compoInfo->implementation_type();
       container_exe_tmp=compoInfo->implementation_name();
       if(impl==SALOME_ModuleCatalog::CEXE)
       {
-        if(found)
-        {
-          INFOS("ContainerManager Error: you can't have 2 CEXE component in the same container" );
-          return Engines::Container::_nil();
-        }
-        MESSAGE("[GiveContainer] Exe container found !: " << container_exe_tmp);
-        container_exe = container_exe_tmp.in();
-        found=1;
+       if(found)
+       {
+         INFOS("ContainerManager Error: you can't have 2 CEXE component in the same container" );
+         _numInstanceMutex.unlock();
+         return Engines::Container::_nil();
+       }
+       MESSAGE("[GiveContainer] Exe container found !: " << container_exe_tmp);
+       container_exe = container_exe_tmp.in();
+       found=1;
       }
     }
   }
   catch (ServiceUnreachable&)
   {
     INFOS("Caught exception: Naming Service Unreachable");
+    _numInstanceMutex.unlock();
     return ret;
   }
   catch (...)
   {
     INFOS("Caught unknown exception.");
+    _numInstanceMutex.unlock();
     return ret;
   }
 
   // Step 8: start a new container
   MESSAGE("[GiveContainer] Try to launch a new container on " << resource_selected);
   std::string command;
-  if(hostname == Kernel_Utils::GetHostname())
+  // if a parallel container is launched in batch job, command is: "mpirun -np nbproc -machinefile nodesfile SALOME_MPIContainer"
+  if( getenv("LIBBATCH_NODEFILE") != NULL && params.isMPI )
+    command = BuildCommandToLaunchLocalContainer(params,container_exe);
+  // if a container is launched on localhost, command is "SALOME_Container" or "mpirun -np nbproc SALOME_MPIContainer"
+  else if(hostname == Kernel_Utils::GetHostname())
     command = BuildCommandToLaunchLocalContainer(params, container_exe);
+  // if a container is launched in remote mode, command is "ssh resource_selected SALOME_Container" or "ssh resource_selected mpirun -np nbproc SALOME_MPIContainer"
   else
     command = BuildCommandToLaunchRemoteContainer(resource_selected, params, container_exe);
 
@@ -404,6 +432,8 @@ SALOME_ContainerManager::GiveContainer(const Engines::ContainerParameters& param
   // launch container with a system call
   int status=system(command.c_str());
 
+  _numInstanceMutex.unlock();
+
   if (status == -1){
     MESSAGE("SALOME_ContainerManager::StartContainer rsh failed (system command status -1)");
     RmTmpFile(_TmpFileName); // command file can be removed here
@@ -555,13 +585,13 @@ SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer
     if (params.isMPI)
     {
       if ((params.resource_params.nb_node <= 0) && (params.resource_params.nb_proc_per_node <= 0))
-        nbproc = 1;
+       nbproc = 1;
       else if (params.resource_params.nb_node == 0)
-        nbproc = params.resource_params.nb_proc_per_node;
+       nbproc = params.resource_params.nb_proc_per_node;
       else if (params.resource_params.nb_proc_per_node == 0)
-        nbproc = params.resource_params.nb_node;
+       nbproc = params.resource_params.nb_node;
       else
-        nbproc = params.resource_params.nb_node * params.resource_params.nb_proc_per_node;
+       nbproc = params.resource_params.nb_node * params.resource_params.nb_proc_per_node;
     }
 
     // "ssh -l user machine distantPath/runRemote.sh hostNS portNS WORKINGDIR workingdir \
@@ -606,7 +636,7 @@ SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer
       command += " WORKINGDIR ";
       command += " '";
       if(wdir == "$TEMPDIR")
-        wdir="\\$TEMPDIR";
+       wdir="\\$TEMPDIR";
       command += wdir; // requested working directory
       command += "'"; 
     }
@@ -621,10 +651,10 @@ SALOME_ContainerManager::BuildCommandToLaunchRemoteContainer
       command += "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
 #elif defined(WITHOPENMPI)
       if( getenv("OMPI_URI_FILE") == NULL )
-        command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
+       command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace";
       else{
-        command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
-        command += getenv("OMPI_URI_FILE");
+       command += "-x PATH -x LD_LIBRARY_PATH -x OMNIORB_CONFIG -x SALOME_trace -ompi-server file:";
+       command += getenv("OMPI_URI_FILE");
       }
 #endif        
       command += " SALOME_MPIContainer ";
@@ -672,6 +702,9 @@ SALOME_ContainerManager::BuildCommandToLaunchLocalContainer
 
       o << nbproc << " ";
 
+      if( getenv("LIBBATCH_NODEFILE") != NULL )
+       o << "-machinefile " << _machinesFile << " ";
+
 #ifdef WITHLAM
       o << "-x PATH,LD_LIBRARY_PATH,OMNIORB_CONFIG,SALOME_trace ";
 #elif defined(WITHOPENMPI)
@@ -837,26 +870,6 @@ string SALOME_ContainerManager::BuildTemporaryFileName() const
   return aFileName;
 }
 
-string SALOME_ContainerManager::GetMPIZeroNode(string machine)
-{
-  int status;
-  string zeronode;
-  string cmd;
-  string tmpFile = BuildTemporaryFileName();
-
-  cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile;
-
-  status = system(cmd.c_str());
-  if( status == 0 ){
-    ifstream fp(tmpFile.c_str(),ios::in);
-    fp >> zeronode;
-  }
-
-  RmTmpFile(tmpFile);
-
-  return zeronode;
-}
-
 //=============================================================================
 /*!
  *  Builds in a temporary file the script to be launched.
@@ -1688,3 +1701,48 @@ SALOME_ContainerManager::BuildCommandToLaunchParallelContainer(const std::string
 }
 #endif
 
+string SALOME_ContainerManager::GetMPIZeroNode(string machine) // Runs "mpirun -np 1 hostname" and returns the first whitespace-delimited word of its output; returns an empty string when the command exits with a non-zero status.
+{
+  int status;
+  string zeronode; // stays empty unless the command below succeeds
+  string cmd;
+  string tmpFile = BuildTemporaryFileName(); // the command's stdout is redirected into this file
+  // Without LIBBATCH_NODEFILE the probe goes through ssh on `machine`; with it, mpirun is run directly with the current _machinesFile and `machine` is not used.
+  if( getenv("LIBBATCH_NODEFILE") == NULL )
+    cmd = "ssh " + machine + " mpirun -np 1 hostname > " + tmpFile;
+  else
+    cmd = "mpirun -np 1 -machinefile " + _machinesFile + " hostname > " + tmpFile; // NOTE(review): relies on _machinesFile having been set beforehand (machinesFile() assigns it) - confirm call order in callers
+
+  status = system(cmd.c_str()); // blocking call; the raw system() status is only compared with 0
+  if( status == 0 ){
+    ifstream fp(tmpFile.c_str(),ios::in);
+    fp >> zeronode; // reads a single word; if the file is empty or unreadable zeronode stays empty
+  }
+  // The temporary file is handed to RmTmpFile whether or not the command succeeded (presumably deletes it - helper not visible here).
+  RmTmpFile(tmpFile);
+
+  return zeronode;
+}
+
+void SALOME_ContainerManager::machinesFile(const int nbproc) // Copies the next nbproc entries of the file named by $LIBBATCH_NODEFILE into a fresh temporary file whose path is stored in _machinesFile; throws SALOME_Exception when fewer than nbproc entries remain.
+{
+  string tmp;
+  string nodesFile = getenv("LIBBATCH_NODEFILE"); // NOTE(review): no NULL check here; the call site visible in GiveContainer tests getenv() first - confirm for any other caller
+  _machinesFile = Kernel_Utils::GetTmpFileName(); // a new path is requested on every call and replaces the previous value
+  ifstream fpi(nodesFile.c_str(),ios::in);
+  ofstream fpo(_machinesFile.c_str(),ios::out); // NOTE(review): neither stream is checked for a successful open
+  // Skip the entries already handed out by earlier calls (_nbprocUsed words, one per extraction).
+  for(int i=0;i<_nbprocUsed;i++)
+    fpi >> tmp;
+  // Copy nbproc whitespace-delimited entries, one per output line.
+  for(int i=0;i<nbproc;i++)
+    if( fpi >> tmp )
+      fpo << tmp << endl;
+    else
+      throw SALOME_Exception("You ask more processes than batch session have allocated!"); // _nbprocUsed is left unchanged on this path, but _machinesFile already names a partially written file. NOTE(review): the GiveContainer call site appears to hold _numInstanceMutex at this point - confirm it is released when this throws
+  // Record the consumed entries so the next call starts after them.
+  _nbprocUsed += nbproc;
+  fpi.close();
+  fpo.close();
+
+}
index a0e7e19eb3961b3849395afd16ca6eaa7ad87219..30c76ef8b6f95c183f73951a0b44c61a4a232b05 100644 (file)
@@ -88,6 +88,8 @@ protected:
 
   std::string GetMPIZeroNode(std::string machine);
 
+  void machinesFile(const int nbproc);
+
   // For PacO++ Parallel extension
   typedef std::vector<std::string> actual_launch_machine_t;
   std::string BuildCommandToLaunchParallelContainer(const std::string& exe_name, 
@@ -115,5 +117,13 @@ protected:
   //! different behaviour if $APPLI exists (SALOME Application) 
   bool _isAppliSalomeDefined;
 
+  //! attribute that contains the number of processes used in batch mode by MPI containers
+  int _nbprocUsed;
+
+  //! attribute that contains the path of the machine file passed to mpirun for MPI containers
+  std::string _machinesFile;
+
+  static omni_mutex _numInstanceMutex ; // lib and instance protection
+
 };
 #endif
index 9208259f0abf30ce77cf374258eac1b0ba695f35..84ab93569e874d09c0f0e8c335bb39bd23e2cea7 100644 (file)
@@ -77,10 +77,10 @@ Launcher::Job_SALOME::buildSalomeScript(Batch::Parametre params)
   launch_script_stream << "CATALOG_FILE=" << work_directory << "/CatalogResources_" << _launch_date << ".xml" << std::endl;
   launch_script_stream << "export USER_CATALOG_RESOURCES_FILE=" << "$CATALOG_FILE" << std::endl;
   launch_script_stream << "echo '<!DOCTYPE ResourcesCatalog>'  > $CATALOG_FILE" << std::endl;
-  launch_script_stream << "echo '<resources>'                 >> $CATALOG_FILE" << std::endl;   
+  launch_script_stream << "echo '<resources>'                 >> $CATALOG_FILE" << std::endl;  
   launch_script_stream << "cat $LIBBATCH_NODEFILE | sort -u | while read host"  << std::endl;
   launch_script_stream << "do"                                                  << std::endl;
-  launch_script_stream << "echo '<resource hostname='\\\"$host\\\"                               >> $CATALOG_FILE" << std::endl;
+  launch_script_stream << "echo '<machine hostname='\\\"$host\\\"                               >> $CATALOG_FILE" << std::endl;
   launch_script_stream << "echo '         protocol=\"" << resource_protocol               << "\"' >> $CATALOG_FILE" << std::endl;
   launch_script_stream << "echo '         userName=\"" << _resource_definition.UserName   << "\"' >> $CATALOG_FILE" << std::endl;
   launch_script_stream << "echo '         appliPath=\"" << _resource_definition.AppliPath << "\"' >> $CATALOG_FILE" << std::endl;
@@ -92,17 +92,17 @@ Launcher::Job_SALOME::buildSalomeScript(Batch::Parametre params)
   // Launch SALOME with an appli
   launch_script_stream << _resource_definition.AppliPath << "/runAppli --terminal  --ns-port-log=" << launch_date_port_file <<  " > logs/salome_" << _launch_date << ".log 2>&1" << std::endl;
   launch_script_stream << "current=0\n"
-                       << "stop=20\n" 
-                       << "while ! test -f " << _resource_definition.AppliPath << "/" << launch_date_port_file << "\n"
-                       << "do\n"
-                       << "  sleep 2\n"
-                       << "  let current=current+1\n"
-                       << "  if [ \"$current\" -eq \"$stop\" ] ; then\n"
-                       << "    echo Error Naming Service failed ! >&2\n"
-                       << "    exit\n"
-                       << "  fi\n"
-                       << "done\n"
-                       << "appli_port=`cat " << _resource_definition.AppliPath << "/" << launch_date_port_file << "`\n";
+                      << "stop=20\n" 
+                      << "while ! test -f " << _resource_definition.AppliPath << "/" << launch_date_port_file << "\n"
+                      << "do\n"
+                      << "  sleep 2\n"
+                      << "  let current=current+1\n"
+                      << "  if [ \"$current\" -eq \"$stop\" ] ; then\n"
+                      << "    echo Error Naming Service failed ! >&2\n"
+                      << "    exit\n"
+                      << "  fi\n"
+                      << "done\n"
+                      << "appli_port=`cat " << _resource_definition.AppliPath << "/" << launch_date_port_file << "`\n";
 
   // Call real job type
   addJobTypeSpecificScript(launch_script_stream);