[cig-commits] r19708 - in seismo/3D/SPECFEM3D_GLOBE/trunk: doc/USER_MANUAL src/specfem3D

dkomati1 at geodynamics.org dkomati1 at geodynamics.org
Fri Mar 2 08:19:58 PST 2012


Author: dkomati1
Date: 2012-03-02 08:19:57 -0800 (Fri, 02 Mar 2012)
New Revision: 19708

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/doc/USER_MANUAL/manual_SPECFEM3D_GLOBE.pdf
   seismo/3D/SPECFEM3D_GLOBE/trunk/doc/USER_MANUAL/manual_SPECFEM3D_GLOBE.tex
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90
Log:
added (partial) OpenMP support from Marcin Zielinski at SARA (The Netherlands)


Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/doc/USER_MANUAL/manual_SPECFEM3D_GLOBE.pdf
===================================================================
(Binary files differ)

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/doc/USER_MANUAL/manual_SPECFEM3D_GLOBE.tex
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/doc/USER_MANUAL/manual_SPECFEM3D_GLOBE.tex	2012-03-02 00:32:56 UTC (rev 19707)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/doc/USER_MANUAL/manual_SPECFEM3D_GLOBE.tex	2012-03-02 16:19:57 UTC (rev 19708)
@@ -468,6 +468,26 @@
 \noindent
 should work.
 
+\section{Adding OpenMP support in addition to MPI}
+
+OpenMP support can be enabled in addition to MPI. However, in most cases performance will not improve
+because our pure MPI implementation is already heavily optimized; the resulting code will often in fact
+be slightly slower.
+
+To enable OpenMP, uncomment the two commented-out compile lines that carry the OpenMP compiler
+option in file \texttt{src/specfem3D/Makefile.in} (before running \texttt{configure}),
+and also uncomment the \texttt{\#define USE\_OPENMP} statement in file\\
+\texttt{src/specfem3D/specfem3D.F90}.
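+
+Note that the OpenMP compiler option is compiler dependent; the commented-out lines in
+\texttt{Makefile.in} use \texttt{-qsmp=omp}, which is the flag of the IBM XL compilers.
+With GNU \texttt{gfortran}, for instance, the link line to uncomment would instead read:
+\begin{verbatim}
+${MPIFCCOMPILE_NO_CHECK} -fopenmp -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
+\end{verbatim}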
+
+The DO-loop parallelized with OpenMP threads uses the SCHEDULE(runtime) clause, and thus the
+OMP\_SCHEDULE environment variable controls its scheduling policy at run time.
+Tests performed by Marcin Zielinski at SARA (The Netherlands) showed that the best
+scheduling policy is often DYNAMIC with a chunk size equal to the number of
+OpenMP threads, or preferably twice the number of OpenMP threads
+(i.e., a chunk size of 8 for 4 OpenMP threads, and so on).
+If OMP\_SCHEDULE is not set or is empty, the DO-loop falls back to the
+implementation's default scheduling policy, which can slow down the run considerably.
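+
+For example, assuming a bash shell and 4 OpenMP threads per MPI process, one could set:
+\begin{verbatim}
+export OMP_NUM_THREADS=4
+export OMP_SCHEDULE="DYNAMIC,8"
+\end{verbatim}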
+
+
 \chapter{\label{cha:Running-the-Mesher}Running the Mesher \texttt{xmeshfem3D}}
 
 You are now ready to compile the mesher. In the directory with the

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/Makefile.in	2012-03-02 00:32:56 UTC (rev 19707)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/Makefile.in	2012-03-02 16:19:57 UTC (rev 19708)
@@ -192,6 +192,8 @@
 
 xspecfem3D: $(XSPECFEM_OBJECTS)
 ## use MPI here
+## DK DK add OpenMP compiler flag here if needed
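+## (note: -qsmp=omp is the OpenMP flag of the IBM XL compilers; with GNU gfortran it would be -fopenmp)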
+#	${MPIFCCOMPILE_NO_CHECK} -qsmp=omp -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
 	${MPIFCCOMPILE_NO_CHECK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
 
 reqheader:
@@ -335,6 +337,8 @@
 	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_crust_mantle.o ${FCFLAGS_f90} $S/compute_forces_crust_mantle.f90
 
 $O/compute_forces_crust_mantle_Dev.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_crust_mantle_Dev.F90
+## DK DK add OpenMP compiler flag here if needed
+#	${FCCOMPILE_NO_CHECK} -c -qsmp=omp -o $O/compute_forces_crust_mantle_Dev.o ${FCFLAGS_f90} $S/compute_forces_crust_mantle_Dev.F90
 	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_crust_mantle_Dev.o ${FCFLAGS_f90} $S/compute_forces_crust_mantle_Dev.F90
 
 $O/compute_forces_outer_core.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_outer_core.f90

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2012-03-02 00:32:56 UTC (rev 19707)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2012-03-02 16:19:57 UTC (rev 19708)
@@ -244,7 +244,7 @@
   logical :: INCLUDE_CENTRAL_CUBE
 
 ! local to global mapping
-  integer NSPEC2D_BOTTOM_INNER_CORE
+  integer NSPEC2D_BOTTOM_INNER_CORE,iend,ispec_glob
   integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
 
 #ifdef _HANDOPT
@@ -255,19 +255,41 @@
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
-  computed_elements = 0
+!$OMP PARALLEL DEFAULT(NONE) SHARED(xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+!$OMP one_minus_sum_beta,epsilon_trace_over_3,c11store,c12store,c13store,c14store,c15store, &
+!$OMP c16store,c22store,c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
+!$OMP c36store,c44store,c45store,c46store,c55store,c56store,c66store,ispec_is_tiso, &
+!$OMP kappavstore,muvstore,kappahstore,muhstore,eta_anisostore,ibool,ystore,zstore, &
+!$OMP R_memory,xstore,minus_gravity_table,minus_deriv_gravity_table,density_table, &
+!$OMP displ_crust_mantle,wgll_cube,accel_inner_core,hprime_xxt,hprime_xx,idoubling_inner_core, &
+!$OMP addressing,iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,vx,vy,vz,vnspec, &
+!$OMP iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle,npoin2D_faces_crust_mantle, &
+!$OMP npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle,iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+!$OMP iboolleft_xi_inner_core,iboolright_xi_inner_core,ibool_inner_core, &
+!$OMP iboolleft_eta_inner_core,iboolright_eta_inner_core,npoin2D_faces_inner_core,npoin2D_xi_inner_core, &
+!$OMP npoin2D_eta_inner_core,iboolfaces_inner_core,accel_crust_mantle, &
+!$OMP iboolcorner_inner_core,iprocfrom_faces,iprocto_faces,iproc_master_corners, &
+!$OMP iproc_worker1_corners,iproc_worker2_corners,buffer_send_faces,buffer_received_faces, &
+!$OMP buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
+!$OMP sender_from_slices_to_cube,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+!$OMP ibelm_bottom_inner_core,hprimewgll_xx,hprimewgll_xxt,wgllwgll_xy,wgllwgll_xz, &
+!$OMP wgllwgll_yz,INCLUDE_CENTRAL_CUBE,alphaval,betaval,epsilondev,gammaval,factor_common, &
+!$OMP myrank,ichunk,iphase,iphase_CC,icall,iproc_xi,iproc_eta,is_on_a_slice_edge_crust_mantle, &
+!$OMP npoin2D_cube_from_slices,nb_msgs_theor_in_cube,receiver_cube_from_slices, &
+!$OMP npoin2D_max_all_CM_IC ) &
+!$OMP PRIVATE(k,j,ispec,fac1,fac2,fac3,sum_terms,iend,ispec_glob, &
+!$OMP C1_mxm_m2_m1_5points,A1_mxm_m2_m1_5points,B2_m1_m2_5points,C3_m1_m2_5points, &
+!$OMP B3_m1_m2_5points,C2_mxm_m2_m1_5points,E1_m1_m2_5points,E2_m1_m2_5points,A2_mxm_m2_m1_5points, &
+!$OMP C3_mxm_m2_m1_5points,A3_mxm_m2_m1_5points,B1_m1_m2_5points, &
+!$OMP C2_m1_m2_5points,C1_m1_m2_5points,E3_m1_m2_5points,E1_mxm_m2_m1_5points,E2_mxm_m2_m1_5points, &
+!$OMP E3_mxm_m2_m1_5points,tempx1,tempx2,tempx3, &
+!$OMP newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3, &
+!$OMP dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
+!$OMP iglobv5,iglob1,epsilondev_loc)
 
-  do ispec = 1,NSPEC_CRUST_MANTLE
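+! outer loop strides through the elements in chunks of ELEMENTS_NONBLOCKING_CM_IC
+! so that the non-blocking MPI communications can be processed between chunks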
+  do ispec_glob = 1,NSPEC_CRUST_MANTLE,ELEMENTS_NONBLOCKING_CM_IC
 
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
-
 ! process the communications every ELEMENTS_NONBLOCKING elements
-    computed_elements = computed_elements + 1
-    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
+    if (USE_NONBLOCKING_COMMS .and. icall == 2) then
 
-      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+      if(iphase <= 7) then
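+! all threads wait at the barrier, then only the master thread performs the MPI call,
+! consistent with the MPI_THREAD_FUNNELED level requested at MPI initialization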
+!$OMP BARRIER
+!$OMP MASTER
+            call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
             npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
@@ -282,17 +304,34 @@
             NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
             NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
             NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+!$OMP END MASTER
+!$OMP BARRIER
+      end if
 
       if(INCLUDE_CENTRAL_CUBE) then
-          if(iphase > 7 .and. iphase_CC <= 4) &
+          if(iphase > 7 .and. iphase_CC <= 4) then
+!$OMP BARRIER
+!$OMP MASTER
             call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
                    npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
                    receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
                    ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
+!$OMP END MASTER
+!$OMP BARRIER
+          end if
       endif
 
     endif
 
+    iend = min(ispec_glob+ELEMENTS_NONBLOCKING_CM_IC-1,NSPEC_CRUST_MANTLE)
+
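+! the scheduling policy and chunk size of this loop are read from the
+! OMP_SCHEDULE environment variable at run time (see the manual)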
+!$OMP DO SCHEDULE(runtime)
+    do ispec=ispec_glob,iend
+
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
+
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
@@ -593,8 +632,13 @@
     if(COMPUTE_AND_STORE_STRAIN) then
       epsilondev(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
     endif
+! end ispec loop
+   end do
+!$OMP END DO
 
+! end of the ispec_glob strided loop
   enddo   ! spectral element loop NSPEC_CRUST_MANTLE
+!$OMP END PARALLEL
 
   end subroutine compute_forces_crust_mantle_Dev
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90	2012-03-02 00:32:56 UTC (rev 19707)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D.F90	2012-03-02 16:19:57 UTC (rev 19708)
@@ -34,6 +34,9 @@
 !
 !#define _HANDOPT
 
+!! DK DK to turn OpenMP on
+!#define USE_OPENMP
+
 ! BEWARE:
 ! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
 ! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
@@ -964,7 +967,16 @@
 !-------------------------------------------------------------------------------------------------
 !
   ! initialize the MPI communicator and start the NPROCTOT MPI processes.
+!! DK DK when turning OpenMP on, use this instead:
+!! DK DK from http://mpi.deino.net/mpi_functions/MPI_Init_thread.html
+!! DK DK MPI_THREAD_FUNNELED: the process may be multi-threaded, but only the main thread will make MPI calls
+!! DK DK (all MPI calls are funneled to the main thread). 
+#ifdef USE_OPENMP
+  integer :: iprovided
+  call MPI_INIT_THREAD(MPI_THREAD_FUNNELED,iprovided,ier)
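+! note: one could additionally check the thread support level actually granted, e.g.
+!   if (iprovided < MPI_THREAD_FUNNELED) stop 'MPI_THREAD_FUNNELED not provided'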
+#else
   call MPI_INIT(ier)
+#endif
 
   ! force Flush-To-Zero if available to avoid very slow Gradual Underflow trapping
   call force_ftz()


