[cig-commits] [commit] devel: Copied comm splitters from cartesian. updates parameters (f957c11)
cig_noreply at geodynamics.org
cig_noreply at geodynamics.org
Fri Dec 5 07:22:39 PST 2014
Repository : https://github.com/geodynamics/specfem3d_globe
On branch : devel
Link : https://github.com/geodynamics/specfem3d_globe/compare/b9fb1aa33196d161098710455fadbb4ed91c5e47...897de40783bd1a4630c2aacd3fa5f8b016d4c189
>---------------------------------------------------------------
commit f957c11ca72e0d35df522f0984ca855b6e74176d
Author: Matthieu Lefebvre <ml15 at princeton.edu>
Date: Mon Dec 1 14:43:38 2014 -0500
Copied comm splitters from cartesian. updates parameters
>---------------------------------------------------------------
f957c11ca72e0d35df522f0984ca855b6e74176d
setup/constants.h.in | 10 ++++
src/shared/parallel.f90 | 142 ++++++++++++++++++++++++++++++++++++++++++++++
src/shared/shared_par.f90 | 15 +++++
3 files changed, 167 insertions(+)
diff --git a/setup/constants.h.in b/setup/constants.h.in
index 8aa046e..c9090fc 100644
--- a/setup/constants.h.in
+++ b/setup/constants.h.in
@@ -438,6 +438,16 @@
!!-----------------------------------------------------------
!!
+!! directory structure
+!!
+!!-----------------------------------------------------------
+
+! base path for the output files
+ character(len=*), parameter :: OUTPUT_FILES_PATH_BASE = './OUTPUT_FILES/'
+
+
+!!-----------------------------------------------------------
+!!
!! movie outputs
!!
!!-----------------------------------------------------------
diff --git a/src/shared/parallel.f90 b/src/shared/parallel.f90
index 5bc0542..8fccdee 100644
--- a/src/shared/parallel.f90
+++ b/src/shared/parallel.f90
@@ -25,6 +25,41 @@
!
!=====================================================================
+!! DK DK July 2014, CNRS Marseille, France:
+!! DK DK added the ability to run several calculations (several earthquakes)
+!! DK DK in an embarrassingly-parallel fashion from within the same run;
+!! DK DK this can be useful when using a very large supercomputer to compute
+!! DK DK many earthquakes in a catalog, in which case it can be better from
+!! DK DK a batch job submission point of view to start fewer and much larger jobs,
+!! DK DK each of them computing several earthquakes in parallel.
+!! DK DK To turn that option on, set parameter NUMBER_OF_SIMULTANEOUS_RUNS
+!! DK DK to a value greater than 1 in file setup/constants.h.in before
+!! DK DK configuring and compiling the code.
+!! DK DK To implement that, we create NUMBER_OF_SIMULTANEOUS_RUNS MPI sub-communicators,
+!! DK DK each of them being labeled "my_local_mpi_comm_world", and we use them
+!! DK DK in all the routines in "src/shared/parallel.f90", except in MPI_ABORT() because in that case
+!! DK DK we need to kill the entire run.
+!! DK DK When that option is on, of course the number of processor cores used to start
+!! DK DK the code in the batch system must be a multiple of NUMBER_OF_SIMULTANEOUS_RUNS,
+!! DK DK all the individual runs must use the same number of processor cores,
+!! DK DK which as usual is NPROC in the input file DATA/Par_file,
+!! DK DK and thus the total number of processor cores to request from the batch system
+!! DK DK should be NUMBER_OF_SIMULTANEOUS_RUNS * NPROC.
+!! DK DK All the runs to perform must be placed in directories called run0001, run0002, run0003 and so on
+!! DK DK (with exactly four digits).
+
+module my_mpi
+
+! holds the MPI communicator handles that world_split() sets up for the wrapper routines
+
+ use mpi
+
+ implicit none
+
+ integer :: my_local_mpi_comm_world, my_local_mpi_comm_for_bcast ! per-run communicator, mesh/model broadcast communicator
+
+end module my_mpi
+
!-------------------------------------------------------------------------------------------------
!
! MPI wrapper functions
@@ -1320,3 +1355,110 @@
end subroutine world_get_info_null
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! splits MPI_COMM_WORLD into one sub-communicator per independent run when several earthquakes share the same job.
+! takes no arguments; sets my_local_mpi_comm_world, my_local_mpi_comm_for_bcast, mygroup, OUTPUT_FILES_PATH
+! and I_should_read_the_database; if there is a single run to do, it just copies the default communicator to the new one
+ subroutine world_split()
+
+ use my_mpi
+ use constants,only: MAX_STRING_LEN,NUMBER_OF_SIMULTANEOUS_RUNS,OUTPUT_FILES_PATH, &
+ IMAIN,ISTANDARD_OUTPUT,mygroup,BROADCAST_SAME_MESH_AND_MODEL,I_should_read_the_database
+
+ implicit none
+
+ integer :: sizeval,myrank,ier,key,my_group_for_bcast,my_local_rank_for_bcast,NPROC ! NPROC here is the process count per run
+
+ character(len=MAX_STRING_LEN) :: path_to_add ! run directory prefix built below
+
+ if (NUMBER_OF_SIMULTANEOUS_RUNS <= 0) stop 'NUMBER_OF_SIMULTANEOUS_RUNS <= 0 makes no sense'
+
+ call MPI_COMM_SIZE(MPI_COMM_WORLD,sizeval,ier) ! total number of processes in the job
+ call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier) ! rank of this process in the whole job
+
+ if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. mod(sizeval,NUMBER_OF_SIMULTANEOUS_RUNS) /= 0) &
+ stop 'the number of MPI processes is not a multiple of NUMBER_OF_SIMULTANEOUS_RUNS'
+
+ if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. IMAIN == ISTANDARD_OUTPUT) &
+ stop 'must not have IMAIN == ISTANDARD_OUTPUT when NUMBER_OF_SIMULTANEOUS_RUNS > 1 otherwise output to screen is mingled'
+
+ if (NUMBER_OF_SIMULTANEOUS_RUNS == 1) then
+
+ my_local_mpi_comm_world = MPI_COMM_WORLD ! single run: no split, reuse the global communicator
+
+! no broadcast of the mesh and model databases to other runs in that case
+ my_group_for_bcast = 0 ! NOTE(review): assigned but not read again within this routine
+ my_local_mpi_comm_for_bcast = MPI_COMM_NULL
+
+ else
+
+!--- create a subcommunicator for each independent run
+
+ NPROC = sizeval / NUMBER_OF_SIMULTANEOUS_RUNS ! exact division, divisibility was checked above
+
+! create the different groups of processes, one for each independent run
+ mygroup = myrank / NPROC ! integer division: each consecutive block of NPROC world ranks forms one run
+ key = myrank ! key passed to MPI_COMM_SPLIT: ranks in the new communicator follow the world rank order
+ if (mygroup < 0 .or. mygroup > NUMBER_OF_SIMULTANEOUS_RUNS-1) stop 'invalid value of mygroup'
+
+! build the sub-communicators
+ call MPI_COMM_SPLIT(MPI_COMM_WORLD, mygroup, key, my_local_mpi_comm_world, ier)
+ if (ier /= 0) stop 'error while trying to create the sub-communicators' ! NOTE(review): literal 0 instead of MPI_SUCCESS
+
+! add the right directory for that run (group numbers start at zero, but directory names start at run0001, thus we add one)
+ write(path_to_add,"('run',i4.4,'/')") mygroup + 1
+ OUTPUT_FILES_PATH = path_to_add(1:len_trim(path_to_add))//OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH)) ! prepend prefix
+
+!--- create a subcommunicator to broadcast the identical mesh and model databases if needed
+ if (BROADCAST_SAME_MESH_AND_MODEL) then
+
+ call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier) ! NOTE(review): myrank was already obtained above, this looks redundant
+! to broadcast the model, split along similar ranks per run instead
+ my_group_for_bcast = mod(myrank,NPROC) ! processes holding the same position within their run share one group
+ key = myrank
+ if (my_group_for_bcast < 0 .or. my_group_for_bcast > NPROC-1) stop 'invalid value of my_group_for_bcast'
+
+! build the sub-communicators
+ call MPI_COMM_SPLIT(MPI_COMM_WORLD, my_group_for_bcast, key, my_local_mpi_comm_for_bcast, ier)
+ if (ier /= 0) stop 'error while trying to create the sub-communicators' ! NOTE(review): literal 0 instead of MPI_SUCCESS
+
+! see if that process will need to read the mesh and model database and then broadcast it to others
+ call MPI_COMM_RANK(my_local_mpi_comm_for_bcast,my_local_rank_for_bcast,ier)
+ if (my_local_rank_for_bcast > 0) I_should_read_the_database = .false. ! flag is left untouched for local rank 0
+
+ else
+
+! no broadcast of the mesh and model databases to other runs in that case
+ my_group_for_bcast = 0 ! NOTE(review): assigned but not read again within this routine
+ my_local_mpi_comm_for_bcast = MPI_COMM_NULL
+
+ endif
+
+ endif
+
+ end subroutine world_split
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! frees the sub-communicators created by world_split() when more than one earthquake is run from the same job.
+ subroutine world_unsplit()
+
+ use my_mpi
+ use constants,only: NUMBER_OF_SIMULTANEOUS_RUNS,BROADCAST_SAME_MESH_AND_MODEL
+
+ implicit none
+
+ integer :: ier ! MPI error code, not checked in this routine
+
+ if (NUMBER_OF_SIMULTANEOUS_RUNS > 1) then ! with a single run the communicator is MPI_COMM_WORLD itself, nothing to free
+ call MPI_COMM_FREE(my_local_mpi_comm_world,ier) ! per-run communicator
+ if (BROADCAST_SAME_MESH_AND_MODEL) call MPI_COMM_FREE(my_local_mpi_comm_for_bcast,ier) ! otherwise it is MPI_COMM_NULL
+ endif
+
+ end subroutine world_unsplit
+
diff --git a/src/shared/shared_par.f90 b/src/shared/shared_par.f90
index d10cdd4..6c028d8 100644
--- a/src/shared/shared_par.f90
+++ b/src/shared/shared_par.f90
@@ -29,6 +29,21 @@
include "constants.h"
+ ! a negative initial value is a convention that indicates that groups
+ ! (i.e. sub-communicators, one per run) are off by default
+ integer :: mygroup = -1
+
+ ! create a copy of the original output file path, to which we may add a
+ ! "run0001/", "run0002/", "run0003/" prefix later
+ ! if NUMBER_OF_SIMULTANEOUS_RUNS > 1
+ character(len=MAX_STRING_LEN) :: OUTPUT_FILES_PATH = OUTPUT_FILES_PATH_BASE
+
+ ! if doing simultaneous runs for the same mesh and model, this flag tells whether
+ ! this process should read the mesh and the model and broadcast them to the others;
+ ! the default value set here is .true.
+ logical :: I_should_read_the_database = .true.
+
+
end module constants
!
More information about the CIG-COMMITS
mailing list