[cig-commits] [commit] devel: Copied comm splitters from cartesian. updates parameters (f957c11)

cig_noreply at geodynamics.org cig_noreply at geodynamics.org
Fri Dec 5 07:22:39 PST 2014


Repository : https://github.com/geodynamics/specfem3d_globe

On branch  : devel
Link       : https://github.com/geodynamics/specfem3d_globe/compare/b9fb1aa33196d161098710455fadbb4ed91c5e47...897de40783bd1a4630c2aacd3fa5f8b016d4c189

>---------------------------------------------------------------

commit f957c11ca72e0d35df522f0984ca855b6e74176d
Author: Matthieu Lefebvre <ml15 at princeton.edu>
Date:   Mon Dec 1 14:43:38 2014 -0500

    Copied comm splitters from cartesian. Updates parameters.


>---------------------------------------------------------------

f957c11ca72e0d35df522f0984ca855b6e74176d
 setup/constants.h.in      |  10 ++++
 src/shared/parallel.f90   | 142 ++++++++++++++++++++++++++++++++++++++++++++++
 src/shared/shared_par.f90 |  15 +++++
 3 files changed, 167 insertions(+)

diff --git a/setup/constants.h.in b/setup/constants.h.in
index 8aa046e..c9090fc 100644
--- a/setup/constants.h.in
+++ b/setup/constants.h.in
@@ -438,6 +438,16 @@
 
 !!-----------------------------------------------------------
 !!
+!! directory structure
+!!
+!!-----------------------------------------------------------
+
+! paths for inputs and outputs files
+  character(len=*), parameter :: OUTPUT_FILES_PATH_BASE = './OUTPUT_FILES/'
+
+
+!!-----------------------------------------------------------
+!!
 !! movie outputs
 !!
 !!-----------------------------------------------------------
diff --git a/src/shared/parallel.f90 b/src/shared/parallel.f90
index 5bc0542..8fccdee 100644
--- a/src/shared/parallel.f90
+++ b/src/shared/parallel.f90
@@ -25,6 +25,41 @@
 !
 !=====================================================================
 
+!! DK DK July 2014, CNRS Marseille, France:
+!! DK DK added the ability to run several calculations (several earthquakes)
+!! DK DK in an embarrassingly-parallel fashion from within the same run;
+!! DK DK this can be useful when using a very large supercomputer to compute
+!! DK DK many earthquakes in a catalog, in which case it can be better from
+!! DK DK a batch job submission point of view to start fewer and much larger jobs,
+!! DK DK each of them computing several earthquakes in parallel.
+!! DK DK To turn that option on, set parameter NUMBER_OF_SIMULTANEOUS_RUNS
+!! DK DK to a value greater than 1 in file setup/constants.h.in before
+!! DK DK configuring and compiling the code.
+!! DK DK To implement that, we create NUMBER_OF_SIMULTANEOUS_RUNS MPI sub-communicators,
+!! DK DK each of them being labeled "my_local_mpi_comm_world", and we use them
+!! DK DK in all the routines in "src/shared/parallel.f90", except in MPI_ABORT() because in that case
+!! DK DK we need to kill the entire run.
+!! DK DK When that option is on, of course the number of processor cores used to start
+!! DK DK the code in the batch system must be a multiple of NUMBER_OF_SIMULTANEOUS_RUNS,
+!! DK DK all the individual runs must use the same number of processor cores,
+!! DK DK which as usual is NPROC in the input file DATA/Par_file,
+!! DK DK and thus the total number of processor cores to request from the batch system
+!! DK DK should be NUMBER_OF_SIMULTANEOUS_RUNS * NPROC.
+!! DK DK All the runs to perform must be placed in directories called run0001, run0002, run0003 and so on
+!! DK DK (with exactly four digits).
+
+module my_mpi
+
+! main parameter module for specfem simulations
+
+  use mpi
+
+  implicit none
+
+  integer :: my_local_mpi_comm_world, my_local_mpi_comm_for_bcast
+
+end module my_mpi
+
 !-------------------------------------------------------------------------------------------------
 !
 ! MPI wrapper functions
@@ -1320,3 +1355,110 @@
 
   end subroutine world_get_info_null
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! create sub-communicators if needed, if running more than one earthquake from the same job.
+!! DK DK create a sub-communicator for each independent run;
+!! DK DK if there is a single run to do, then just copy the default communicator to the new one
+  subroutine world_split()
+
+  use my_mpi
+  use constants,only: MAX_STRING_LEN,NUMBER_OF_SIMULTANEOUS_RUNS,OUTPUT_FILES_PATH, &
+    IMAIN,ISTANDARD_OUTPUT,mygroup,BROADCAST_SAME_MESH_AND_MODEL,I_should_read_the_database
+
+  implicit none
+
+  integer :: sizeval,myrank,ier,key,my_group_for_bcast,my_local_rank_for_bcast,NPROC
+
+  character(len=MAX_STRING_LEN) :: path_to_add
+
+  if (NUMBER_OF_SIMULTANEOUS_RUNS <= 0) stop 'NUMBER_OF_SIMULTANEOUS_RUNS <= 0 makes no sense'
+
+  call MPI_COMM_SIZE(MPI_COMM_WORLD,sizeval,ier)
+  call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)
+
+  if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. mod(sizeval,NUMBER_OF_SIMULTANEOUS_RUNS) /= 0) &
+    stop 'the number of MPI processes is not a multiple of NUMBER_OF_SIMULTANEOUS_RUNS'
+
+  if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. IMAIN == ISTANDARD_OUTPUT) &
+    stop 'must not have IMAIN == ISTANDARD_OUTPUT when NUMBER_OF_SIMULTANEOUS_RUNS > 1 otherwise output to screen is mingled'
+
+  if (NUMBER_OF_SIMULTANEOUS_RUNS == 1) then
+
+    my_local_mpi_comm_world = MPI_COMM_WORLD
+
+! no broadcast of the mesh and model databases to other runs in that case
+    my_group_for_bcast = 0
+    my_local_mpi_comm_for_bcast = MPI_COMM_NULL
+
+  else
+
+!--- create a subcommunicator for each independent run
+
+    NPROC = sizeval / NUMBER_OF_SIMULTANEOUS_RUNS
+
+!   create the different groups of processes, one for each independent run
+    mygroup = myrank / NPROC
+    key = myrank
+    if (mygroup < 0 .or. mygroup > NUMBER_OF_SIMULTANEOUS_RUNS-1) stop 'invalid value of mygroup'
+
+!   build the sub-communicators
+    call MPI_COMM_SPLIT(MPI_COMM_WORLD, mygroup, key, my_local_mpi_comm_world, ier)
+    if (ier /= 0) stop 'error while trying to create the sub-communicators'
+
+!   add the right directory for that run (group numbers start at zero, but directory names start at run0001, thus we add one)
+    write(path_to_add,"('run',i4.4,'/')") mygroup + 1
+    OUTPUT_FILES_PATH = path_to_add(1:len_trim(path_to_add))//OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH))
+
+!--- create a subcommunicator to broadcast the identical mesh and model databases if needed
+    if (BROADCAST_SAME_MESH_AND_MODEL) then
+
+      call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)
+!     to broadcast the model, split along similar ranks per run instead
+      my_group_for_bcast = mod(myrank,NPROC)
+      key = myrank
+      if (my_group_for_bcast < 0 .or. my_group_for_bcast > NPROC-1) stop 'invalid value of my_group_for_bcast'
+
+!     build the sub-communicators
+      call MPI_COMM_SPLIT(MPI_COMM_WORLD, my_group_for_bcast, key, my_local_mpi_comm_for_bcast, ier)
+      if (ier /= 0) stop 'error while trying to create the sub-communicators'
+
+!     see if that process will need to read the mesh and model database and then broadcast it to others
+      call MPI_COMM_RANK(my_local_mpi_comm_for_bcast,my_local_rank_for_bcast,ier)
+      if (my_local_rank_for_bcast > 0) I_should_read_the_database = .false.
+
+    else
+
+! no broadcast of the mesh and model databases to other runs in that case
+      my_group_for_bcast = 0
+      my_local_mpi_comm_for_bcast = MPI_COMM_NULL
+
+    endif
+
+  endif
+
+  end subroutine world_split
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! close sub-communicators if needed, if running more than one earthquake from the same job.
+  subroutine world_unsplit()
+
+  use my_mpi
+  use constants,only: NUMBER_OF_SIMULTANEOUS_RUNS,BROADCAST_SAME_MESH_AND_MODEL
+
+  implicit none
+
+  integer :: ier
+
+  if (NUMBER_OF_SIMULTANEOUS_RUNS > 1) then
+    call MPI_COMM_FREE(my_local_mpi_comm_world,ier)
+    if (BROADCAST_SAME_MESH_AND_MODEL) call MPI_COMM_FREE(my_local_mpi_comm_for_bcast,ier)
+  endif
+
+  end subroutine world_unsplit
+
diff --git a/src/shared/shared_par.f90 b/src/shared/shared_par.f90
index d10cdd4..6c028d8 100644
--- a/src/shared/shared_par.f90
+++ b/src/shared/shared_par.f90
@@ -29,6 +29,21 @@
 
   include "constants.h"
 
+  ! a negative initial value is a convention that indicates that groups 
+  ! (i.e. sub-communicators, one per run) are off by default
+  integer :: mygroup = -1
+
+  ! create a copy of the original output file path, to which we may add a
+  ! "run0001/", "run0002/", "run0003/" prefix later
+  ! if NUMBER_OF_SIMULTANEOUS_RUNS > 1
+  character(len=MAX_STRING_LEN) :: OUTPUT_FILES_PATH = OUTPUT_FILES_PATH_BASE
+
+  ! if doing simultaneous runs for the same mesh and model, see who
+  ! should read the mesh and the model and broadcast it to others
+  ! we put a default value here
+  logical :: I_should_read_the_database = .true.
+
+
   end module constants
 
 !



More information about the CIG-COMMITS mailing list