[cig-commits] r12910 - in seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta: . setup src

dkomati1 at geodynamics.org
Wed Sep 17 11:17:25 PDT 2008


Author: dkomati1
Date: 2008-09-17 11:17:25 -0700 (Wed, 17 Sep 2008)
New Revision: 12910

Added:
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector_block.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/debug_with_opendx.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90
Removed:
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.f90
Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_chunk_buffers.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_regions_mesh.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/exit_mpi.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/read_compute_parameters.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90
Log:
non-blocking communications in the solver to assemble slices inside each chunk;
this used to be implemented with (blocking) MPI_SENDRECV and was therefore slow at very high resolution;
now I do it with MPI_ISEND and MPI_IRECV, followed by MPI_TEST to check whether the messages have arrived
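For readers unfamiliar with the pattern, here is a minimal, self-contained sketch (not part of the commit) of the non-blocking exchange described above: post MPI_IRECV/MPI_ISEND, go back to local work, and only complete the exchange once MPI_TEST reports that the messages have arrived. The ring topology, buffer size and the placeholder for local work are illustrative assumptions, not the solver's actual 2-D slice addressing.

  ! minimal sketch of the MPI_ISEND / MPI_IRECV / MPI_TEST pattern (assumptions:
  ! ring exchange with the neighbouring rank, fixed buffer of 1000 doubles)
  program nonblocking_sketch
  implicit none
  include 'mpif.h'
  integer :: ier, myrank, nproc, sender, receiver, request_send, request_receive
  integer, dimension(MPI_STATUS_SIZE) :: msg_status
  logical :: flag_result_test
  real(kind=8), dimension(1000) :: buffer_send, buffer_recv

  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ier)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ier)

  ! exchange with the next rank in a ring (illustrative topology only)
  receiver = mod(myrank + 1, nproc)
  sender = mod(myrank - 1 + nproc, nproc)
  buffer_send(:) = dble(myrank)

  ! phase 1: post the non-blocking receive and send, then return to computing
  call MPI_IRECV(buffer_recv, 1000, MPI_DOUBLE_PRECISION, sender, 0, &
                 MPI_COMM_WORLD, request_receive, ier)
  call MPI_ISEND(buffer_send, 1000, MPI_DOUBLE_PRECISION, receiver, 0, &
                 MPI_COMM_WORLD, request_send, ier)

  ! phase 2: poll with MPI_TEST; while the message is not there yet,
  ! keep computing instead of blocking (in the solver, more elements
  ! would be processed here between the polls)
  flag_result_test = .false.
  do while (.not. flag_result_test)
    call MPI_TEST(request_receive, flag_result_test, msg_status, ier)
  enddo
  call MPI_WAIT(request_send, msg_status, ier)

  call MPI_FINALIZE(ier)
  end program nonblocking_sketch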


Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile	2008-09-17 18:17:25 UTC (rev 12910)
@@ -115,11 +115,14 @@
 	$O/assemble_MPI_scalar.o \
 	$O/assemble_MPI_scalar_block.o \
 	$O/assemble_MPI_vector.o \
+	$O/assemble_MPI_vector_block.o \
 	$O/attenuation_model.o \
 	$O/calc_jacobian.o \
 	$O/convert_time.o \
 	$O/calendar.o \
 	$O/create_name_database.o \
+	$O/debug_with_opendx.o \
+	$O/fix_non_blocking_arrays.o \
 	$O/write_AVS_DX_surface_data.o \
 	$O/write_AVS_DX_global_chunks_data.o \
 	$O/write_AVS_DX_global_faces_data.o \
@@ -269,11 +272,11 @@
 $O/specfem3D.o: $(SPECINC)/constants.h $(OUTPUT_FILES_INC)/values_from_mesher.h $S/specfem3D.F90
 	${MPIFCCOMPILE_NO_CHECK} -c -o $O/specfem3D.o ${FCFLAGS_f90} $S/specfem3D.F90
 
-$O/compute_forces_CM_IC.o: $(SPECINC)/constants.h $(OUTPUT_FILES_INC)/values_from_mesher.h $S/compute_forces_CM_IC.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_CM_IC.o ${FCFLAGS_f90} $S/compute_forces_CM_IC.f90
+$O/compute_forces_CM_IC.o: $(SPECINC)/constants.h $(OUTPUT_FILES_INC)/values_from_mesher.h $S/compute_forces_CM_IC.F90
+	${MPIFCCOMPILE_NO_CHECK} -c -o $O/compute_forces_CM_IC.o ${FCFLAGS_f90} $S/compute_forces_CM_IC.F90
 
-$O/compute_forces_OC.o: $(SPECINC)/constants.h $(OUTPUT_FILES_INC)/values_from_mesher.h $S/compute_forces_OC.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_OC.o ${FCFLAGS_f90} $S/compute_forces_OC.f90
+$O/compute_forces_OC.o: $(SPECINC)/constants.h $(OUTPUT_FILES_INC)/values_from_mesher.h $S/compute_forces_OC.F90
+	${MPIFCCOMPILE_NO_CHECK} -c -o $O/compute_forces_OC.o ${FCFLAGS_f90} $S/compute_forces_OC.F90
 
 ### use MPI here
 $O/assemble_MPI_vector.o: $(SPECINC)/constants.h $S/assemble_MPI_vector.F90
@@ -286,6 +289,9 @@
 $O/assemble_MPI_scalar_block.o: $(SPECINC)/constants.h $S/assemble_MPI_scalar_block.F90
 	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_scalar_block.o ${FCFLAGS_f90} $S/assemble_MPI_scalar_block.F90
 
+$O/assemble_MPI_vector_block.o: $(SPECINC)/constants.h $S/assemble_MPI_vector_block.F90
+	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_vector_block.o ${FCFLAGS_f90} $S/assemble_MPI_vector_block.F90
+
 $O/assemble_MPI_central_cube.o: $(SPECINC)/constants.h $(OUTPUT_FILES_INC)/values_from_mesher.h $S/assemble_MPI_central_cube.F90
 	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_central_cube.o ${FCFLAGS_f90} $S/assemble_MPI_central_cube.F90
 
@@ -352,6 +358,12 @@
 $O/create_name_database.o: $(SPECINC)/constants.h $S/create_name_database.f90
 	${FCCOMPILE_CHECK} -c -o $O/create_name_database.o ${FCFLAGS_f90} $S/create_name_database.f90
 
+$O/debug_with_opendx.o: $(SPECINC)/constants.h $S/debug_with_opendx.f90
+	${FCCOMPILE_CHECK} -c -o $O/debug_with_opendx.o ${FCFLAGS_f90} $S/debug_with_opendx.f90
+
+$O/fix_non_blocking_arrays.o: $(SPECINC)/constants.h $S/fix_non_blocking_arrays.f90
+	${FCCOMPILE_CHECK} -c -o $O/fix_non_blocking_arrays.o ${FCFLAGS_f90} $S/fix_non_blocking_arrays.f90
+
 $O/write_AVS_DX_surface_data.o: $(SPECINC)/constants.h $S/write_AVS_DX_surface_data.f90
 	${FCCOMPILE_CHECK} -c -o $O/write_AVS_DX_surface_data.o ${FCFLAGS_f90} $S/write_AVS_DX_surface_data.f90
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h	2008-09-17 18:17:25 UTC (rev 12910)
@@ -31,6 +31,13 @@
 !--- user can modify parameters below
 !
 
+! this for non blocking assembly
+  logical, parameter :: USE_NONBLOCKING_COMMS = .true.
+  integer, parameter :: ELEMENTS_BETWEEN_NONBLOCKING = 200
+
+  logical, parameter :: DEBUG_NONBLOCKING_COMMS = .false.
+  logical, parameter :: DEBUG_USING_OPENDX = .false.
+
 !! DK DK temporary patch for the large Gordon Bell runs: set RECEIVERS_CAN_BE_BURIED
 !! DK DK to false in all cases etc
   logical, parameter :: PATCH_FOR_GORDON_BELL = .true.

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -40,7 +40,7 @@
             buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
             NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL, &
-            NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,NCHUNKS)
+            NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,NCHUNKS,iphase)
 
   implicit none
 
@@ -54,7 +54,7 @@
   include "precision.h"
 #endif
 
-  integer myrank,nglob,NCHUNKS
+  integer myrank,nglob,NCHUNKS,iphase
 
 ! array to assemble
   real(kind=CUSTOM_REAL), dimension(nglob) :: array_val
@@ -105,6 +105,8 @@
 
 #ifdef USE_MPI
   integer :: ier
+  integer, save :: request_send,request_receive
+  logical :: flag_result_test
 #endif
 
 ! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
@@ -122,8 +124,7 @@
 !---- first assemble along xi using the 2-D topology
 !----
 
-! assemble along xi only if more than one slice
-  if(NPROC_XI > 1) then
+  if(iphase == 1) then
 
 ! slices copy the right face into the buffer
   do ipoin=1,npoin2D_xi(2)
@@ -146,11 +147,40 @@
     receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
+!       itag2,buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
+!       itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
+        itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 1
+
+  if(iphase == 2) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices add the buffer received to the contributions on the left face
   if(iproc_xi > 0) then
   do ipoin=1,npoin2D_xi(1)
@@ -182,11 +212,40 @@
     receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
+!       itag2,buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
+!       itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
+        itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 2
+
+  if(iphase == 3) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_xi < NPROC_XI-1) then
   do ipoin=1,npoin2D_xi(2)
@@ -194,15 +253,10 @@
   enddo
   endif
 
-  endif
-
 !----
 !---- then assemble along eta using the 2-D topology
 !----
 
-! assemble along eta only if more than one slice
-  if(NPROC_ETA > 1) then
-
 ! slices copy the right face into the buffer
   do ipoin=1,npoin2D_eta(2)
     buffer_send_faces_scalar(ipoin) = array_val(iboolright_eta(ipoin))
@@ -224,11 +278,40 @@
     receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
+!   itag2,buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
+!   itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
+    itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 3
+
+  if(iphase == 4) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices add the buffer received to the contributions on the left face
   if(iproc_eta > 0) then
   do ipoin=1,npoin2D_eta(1)
@@ -260,11 +343,40 @@
     receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
+!   itag2,buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
+!   itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
+    itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 4
+
+  if(iphase == 5) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_eta < NPROC_ETA-1) then
   do ipoin=1,npoin2D_eta(2)
@@ -272,15 +384,20 @@
   enddo
   endif
 
-  endif
+  iphase = iphase + 1
 
+!! DK DK do the rest in blocking for now, for simplicity
+
 !----
 !---- start MPI assembling phase between chunks
 !----
 
 ! check flag to see if we need to assemble (might be turned off when debugging)
 ! and do not assemble if only one chunk
-  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) return
+  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) then
+    iphase = 9999 ! this means everything is finished
+    return
+  endif
 
 ! ***************************************************************
 !  transmit messages in forward direction (iprocfrom -> iprocto)
@@ -487,5 +604,7 @@
 
   enddo
 
+  endif !!!!!!!!! end of iphase 5
+
   end subroutine assemble_MPI_scalar
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -45,7 +45,7 @@
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
             NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL_crust_mantle, &
-            NGLOB1D_RADIAL_inner_core,NCHUNKS)
+            NGLOB1D_RADIAL_inner_core,NCHUNKS,iphase)
 
   implicit none
 
@@ -62,7 +62,7 @@
 ! include values created by the mesher
   include "values_from_mesher.h"
 
-  integer myrank,NCHUNKS
+  integer myrank,NCHUNKS,iphase
 
 ! the two arrays to assemble
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: accel_crust_mantle
@@ -119,10 +119,14 @@
   integer :: imsg,imsg_loop
   integer :: icount_faces,npoin2D_chunks_all
 
-  integer :: npoin2D_xi_all,npoin2D_eta_all,NGLOB1D_RADIAL_all,ioffset
+  integer :: npoin2D_xi_all,npoin2D_eta_all,NGLOB1D_RADIAL_all
 
+  integer, save :: ioffset
+
 #ifdef USE_MPI
   integer :: ier
+  integer, save :: request_send,request_receive
+  logical :: flag_result_test
 #endif
 
 ! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
@@ -144,8 +148,7 @@
 !---- first assemble along xi using the 2-D topology
 !----
 
-! assemble along xi only if more than one slice
-  if(NPROC_XI > 1) then
+  if(iphase == 1) then
 
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_xi_crust_mantle
@@ -179,11 +182,40 @@
     receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+!       itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+!       itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+        itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 1
+
+  if(iphase == 2) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices add the buffer received to the contributions on the left face
   if(iproc_xi > 0) then
 
@@ -238,11 +270,40 @@
     receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+!       itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+!       itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+!       itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+        itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 2
+
+  if(iphase == 3) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_xi < NPROC_XI-1) then
 
@@ -260,15 +321,10 @@
 
   endif
 
-  endif
-
 !----
 !---- then assemble along eta using the 2-D topology
 !----
 
-! assemble along eta only if more than one slice
-  if(NPROC_ETA > 1) then
-
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_eta_crust_mantle
 
@@ -301,11 +357,40 @@
     receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+!   itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+!   itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+    itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 3
+
+  if(iphase == 4) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices add the buffer received to the contributions on the left face
   if(iproc_eta > 0) then
 
@@ -360,11 +445,40 @@
     receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
   endif
 #ifdef USE_MPI
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+! call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+!   itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+!   itag,MPI_COMM_WORLD,msg_status,ier)
+
+! call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+!   itag2,MPI_COMM_WORLD,ier)
+
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,request_receive,ier)
+
+  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+    itag2,MPI_COMM_WORLD,request_send,ier)
+
 #endif
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 4
+
+  if(iphase == 5) then
+
+#ifdef USE_MPI
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_eta < NPROC_ETA-1) then
 
@@ -382,15 +496,20 @@
 
   endif
 
-  endif
+  iphase = iphase + 1
 
+!! DK DK do the rest in blocking for now, for simplicity
+
 !----
 !---- start MPI assembling phase between chunks
 !----
 
 ! check flag to see if we need to assemble (might be turned off when debugging)
 ! and do not assemble if only one chunk
-  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) return
+  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) then
+    iphase = 9999 ! this means everything is finished
+    return
+  endif
 
 ! ***************************************************************
 !  transmit messages in forward direction (iprocfrom -> iprocto)
@@ -739,5 +858,7 @@
 
   enddo
 
+  endif !!!!!!!!! end of iphase 5
+
   end subroutine assemble_MPI_vector
 

Added: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector_block.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector_block.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector_block.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -0,0 +1,743 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory, California Institute of Technology, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) California Institute of Technology and University of Pau / CNRS / INRIA
+!                            August 2008
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+!----
+!---- assemble the contributions between slices and chunks using MPI
+!---- we handle two regions (crust/mantle and inner core) in the same MPI call
+!---- to reduce the total number of MPI calls
+!----
+
+  subroutine assemble_MPI_vector_block(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces_vector,buffer_received_faces_vector,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL_crust_mantle, &
+            NGLOB1D_RADIAL_inner_core,NCHUNKS)
+
+  implicit none
+
+! standard include of the MPI library
+#ifdef USE_MPI
+  include 'mpif.h'
+#endif
+
+  include "constants.h"
+#ifdef USE_MPI
+  include "precision.h"
+#endif
+
+! include values created by the mesher
+  include "values_from_mesher.h"
+
+  integer myrank,NCHUNKS
+
+! the two arrays to assemble
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: accel_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: accel_inner_core
+
+  integer iproc_xi,iproc_eta,ichunk
+  integer npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle
+  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
+  integer npoin2D_xi_inner_core,npoin2D_eta_inner_core
+  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+
+  integer NGLOB1D_RADIAL_crust_mantle,NGLOB1D_RADIAL_inner_core,NPROC_XI,NPROC_ETA
+  integer NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS
+
+! for addressing of the slices
+  integer, dimension(NCHUNKS,0:NPROC_XI-1,0:NPROC_ETA-1) :: addressing
+
+! 2-D addressing and buffers for summation between slices
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+
+! indirect addressing for each corner of the chunks
+  integer, dimension(NGLOB1D_RADIAL_crust_mantle,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
+  integer, dimension(NGLOB1D_RADIAL_inner_core,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+  integer icount_corners
+
+  integer :: npoin2D_max_all
+  integer, dimension(NGLOB2DMAX_XY_VAL_CM,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+  integer, dimension(NGLOB2DMAX_XY_VAL_IC,NUMFACES_SHARED) :: iboolfaces_inner_core
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all) :: buffer_send_faces_vector,buffer_received_faces_vector
+
+! buffers for send and receive between corners of the chunks
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_crust_mantle + NGLOB1D_RADIAL_inner_core) :: &
+    buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector
+
+! ---- arrays to assemble between chunks
+
+! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES) :: iprocfrom_faces,iprocto_faces,imsg_type
+
+! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+! MPI status of messages to be received
+#ifdef USE_MPI
+  integer, dimension(MPI_STATUS_SIZE) :: msg_status
+#endif
+
+  integer :: ipoin,ipoin2D,ipoin1D
+  integer :: sender,receiver
+  integer :: imsg,imsg_loop
+  integer :: icount_faces,npoin2D_chunks_all
+
+  integer :: npoin2D_xi_all,npoin2D_eta_all,NGLOB1D_RADIAL_all,ioffset
+
+#ifdef USE_MPI
+  integer :: ier
+#endif
+
+! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
+
+! check flag to see if we need to assemble (might be turned off when debugging)
+  if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES) return
+
+! here we have to assemble all the contributions between slices using MPI
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  npoin2D_xi_all = npoin2D_xi_crust_mantle + npoin2D_xi_inner_core
+  npoin2D_eta_all = npoin2D_eta_crust_mantle + npoin2D_eta_inner_core
+
+!----
+!---- assemble the contributions between slices using MPI
+!----
+
+!----
+!---- first assemble along xi using the 2-D topology
+!----
+
+! assemble along xi only if more than one slice
+  if(NPROC_XI > 1) then
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+  ioffset = npoin2D_xi_crust_mantle
+
+! slices copy the right face into the buffer
+  do ipoin = 1,npoin2D_xi_crust_mantle
+    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin))
+  enddo
+
+  do ipoin = 1,npoin2D_xi_inner_core
+    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolright_xi_inner_core(ipoin))
+  enddo
+
+! send messages forward along each row
+  if(iproc_xi == 0) then
+#ifdef USE_MPI
+    sender = MPI_PROC_NULL
+#endif
+  else
+    sender = addressing(ichunk,iproc_xi - 1,iproc_eta)
+  endif
+  if(iproc_xi == NPROC_XI-1) then
+#ifdef USE_MPI
+    receiver = MPI_PROC_NULL
+#endif
+  else
+    receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
+  endif
+#ifdef USE_MPI
+  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+        itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+! all slices add the buffer received to the contributions on the left face
+  if(iproc_xi > 0) then
+
+  do ipoin = 1,npoin2D_xi_crust_mantle
+    accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) + &
+                              buffer_received_faces_vector(1,ipoin)
+    accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) + &
+                              buffer_received_faces_vector(2,ipoin)
+    accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) + &
+                              buffer_received_faces_vector(3,ipoin)
+  enddo
+
+  do ipoin = 1,npoin2D_xi_inner_core
+    accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) + &
+                              buffer_received_faces_vector(1,ioffset + ipoin)
+    accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) + &
+                              buffer_received_faces_vector(2,ioffset + ipoin)
+    accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) + &
+                              buffer_received_faces_vector(3,ioffset + ipoin)
+  enddo
+
+  endif
+
+! the contributions are correctly assembled on the left side of each slice
+! now we have to send the result back to the sender
+! all slices copy the left face into the buffer
+  do ipoin = 1,npoin2D_xi_crust_mantle
+    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin))
+  enddo
+
+  do ipoin = 1,npoin2D_xi_inner_core
+    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin))
+  enddo
+
+! send messages backward along each row
+  if(iproc_xi == NPROC_XI-1) then
+#ifdef USE_MPI
+    sender = MPI_PROC_NULL
+#endif
+  else
+    sender = addressing(ichunk,iproc_xi + 1,iproc_eta)
+  endif
+  if(iproc_xi == 0) then
+#ifdef USE_MPI
+    receiver = MPI_PROC_NULL
+#endif
+  else
+    receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
+  endif
+#ifdef USE_MPI
+  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+        itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+! all slices copy the buffer received to the contributions on the right face
+  if(iproc_xi < NPROC_XI-1) then
+
+  do ipoin = 1,npoin2D_xi_crust_mantle
+    accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin)
+    accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin)
+    accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin)
+  enddo
+
+  do ipoin = 1,npoin2D_xi_inner_core
+    accel_inner_core(1,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin)
+    accel_inner_core(2,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin)
+    accel_inner_core(3,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin)
+  enddo
+
+  endif
+
+  endif
+
+!----
+!---- then assemble along eta using the 2-D topology
+!----
+
+! assemble along eta only if more than one slice
+  if(NPROC_ETA > 1) then
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+  ioffset = npoin2D_eta_crust_mantle
+
+! slices copy the right face into the buffer
+  do ipoin = 1,npoin2D_eta_crust_mantle
+    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin))
+  enddo
+
+  do ipoin = 1,npoin2D_eta_inner_core
+    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolright_eta_inner_core(ipoin))
+  enddo
+
+! send messages forward along each row
+  if(iproc_eta == 0) then
+#ifdef USE_MPI
+    sender = MPI_PROC_NULL
+#endif
+  else
+    sender = addressing(ichunk,iproc_xi,iproc_eta - 1)
+  endif
+  if(iproc_eta == NPROC_ETA-1) then
+#ifdef USE_MPI
+    receiver = MPI_PROC_NULL
+#endif
+  else
+    receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
+  endif
+#ifdef USE_MPI
+  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+    itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+! all slices add the buffer received to the contributions on the left face
+  if(iproc_eta > 0) then
+
+  do ipoin = 1,npoin2D_eta_crust_mantle
+    accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) + &
+                              buffer_received_faces_vector(1,ipoin)
+    accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) + &
+                              buffer_received_faces_vector(2,ipoin)
+    accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) + &
+                              buffer_received_faces_vector(3,ipoin)
+  enddo
+
+  do ipoin = 1,npoin2D_eta_inner_core
+    accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) + &
+                              buffer_received_faces_vector(1,ioffset + ipoin)
+    accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) + &
+                              buffer_received_faces_vector(2,ioffset + ipoin)
+    accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) + &
+                              buffer_received_faces_vector(3,ioffset + ipoin)
+  enddo
+
+  endif
+
+! the contributions are correctly assembled on the left side of each slice
+! now we have to send the result back to the sender
+! all slices copy the left face into the buffer
+  do ipoin = 1,npoin2D_eta_crust_mantle
+    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin))
+  enddo
+
+  do ipoin = 1,npoin2D_eta_inner_core
+    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin))
+  enddo
+
+! send messages backward along each row
+  if(iproc_eta == NPROC_ETA-1) then
+#ifdef USE_MPI
+    sender = MPI_PROC_NULL
+#endif
+  else
+    sender = addressing(ichunk,iproc_xi,iproc_eta + 1)
+  endif
+  if(iproc_eta == 0) then
+#ifdef USE_MPI
+    receiver = MPI_PROC_NULL
+#endif
+  else
+    receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
+  endif
+#ifdef USE_MPI
+  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+    itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+! all slices copy the buffer received to the contributions on the right face
+  if(iproc_eta < NPROC_ETA-1) then
+
+  do ipoin = 1,npoin2D_eta_crust_mantle
+    accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin)
+    accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin)
+    accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin)
+  enddo
+
+  do ipoin = 1,npoin2D_eta_inner_core
+    accel_inner_core(1,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin)
+    accel_inner_core(2,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin)
+    accel_inner_core(3,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin)
+  enddo
+
+  endif
+
+  endif
+
+!----
+!---- start MPI assembling phase between chunks
+!----
+
+! check flag to see if we need to assemble (might be turned off when debugging)
+! and do not assemble if only one chunk
+  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) return
+
+! ***************************************************************
+!  transmit messages in forward direction (iprocfrom -> iprocto)
+! ***************************************************************
+
+!---- put slices in receive mode
+!---- a given slice can belong to at most two faces
+
+! use three step scheme that can never deadlock
+! scheme for faces cannot deadlock even if NPROC_XI = NPROC_ETA = 1
+  do imsg_loop = 1,NUM_MSG_TYPES
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. &
+       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+    sender = iprocfrom_faces(imsg)
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+    npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+#ifdef USE_MPI
+    call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+              itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D)
+      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D)
+      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D)
+    enddo
+
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ioffset + ipoin2D)
+      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ioffset + ipoin2D)
+      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ioffset + ipoin2D)
+    enddo
+
+  endif
+  enddo
+
+!---- put slices in send mode
+!---- a given slice can belong to at most two faces
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. &
+       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+    receiver = iprocto_faces(imsg)
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+    npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      buffer_send_faces_vector(1,ipoin2D) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ipoin2D) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ipoin2D) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+    enddo
+
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      buffer_send_faces_vector(1,ioffset + ipoin2D) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ioffset + ipoin2D) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ioffset + ipoin2D) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
+    enddo
+
+#ifdef USE_MPI
+    call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+#endif
+
+  endif
+  enddo
+
+
+! *********************************************************************
+!  transmit messages back in opposite direction (iprocto -> iprocfrom)
+! *********************************************************************
+
+!---- put slices in receive mode
+!---- a given slice can belong to at most two faces
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. &
+       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+    sender = iprocto_faces(imsg)
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+    npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+#ifdef USE_MPI
+    call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+              itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D)
+      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D)
+      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D)
+    enddo
+
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ioffset + ipoin2D)
+      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ioffset + ipoin2D)
+      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ioffset + ipoin2D)
+    enddo
+
+  endif
+  enddo
+
+!---- put slices in send mode
+!---- a given slice can belong to at most two faces
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. &
+       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+    receiver = iprocfrom_faces(imsg)
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+    npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      buffer_send_faces_vector(1,ipoin2D) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ipoin2D) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ipoin2D) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+    enddo
+
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      buffer_send_faces_vector(1,ioffset + ipoin2D) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ioffset + ipoin2D) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ioffset + ipoin2D) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
+    enddo
+
+#ifdef USE_MPI
+    call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+#endif
+
+  endif
+  enddo
+
+! end of anti-deadlocking loop
+  enddo
+
+!----
+!---- start MPI assembling corners
+!----
+
+! scheme for corners cannot deadlock even if NPROC_XI = NPROC_ETA = 1
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  NGLOB1D_RADIAL_all = NGLOB1D_RADIAL_crust_mantle + NGLOB1D_RADIAL_inner_core
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+  ioffset = NGLOB1D_RADIAL_crust_mantle
+
+! ***************************************************************
+!  transmit messages in forward direction (two workers -> master)
+! ***************************************************************
+
+  icount_corners = 0
+
+  do imsg = 1,NCORNERSCHUNKS
+
+  if(myrank == iproc_master_corners(imsg) .or. &
+     myrank == iproc_worker1_corners(imsg) .or. &
+     (NCHUNKS /= 2 .and. myrank == iproc_worker2_corners(imsg))) icount_corners = icount_corners + 1
+
+!---- receive messages from the two workers on the master
+  if(myrank==iproc_master_corners(imsg)) then
+
+! receive from worker #1 and add to local array
+    sender = iproc_worker1_corners(imsg)
+
+#ifdef USE_MPI
+    call MPI_RECV(buffer_recv_chunkcorners_vector,NDIM*NGLOB1D_RADIAL_all, &
+          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+      accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+               accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(1,ipoin1D)
+      accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+               accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(2,ipoin1D)
+      accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+               accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(3,ipoin1D)
+    enddo
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+      accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+               accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(1,ioffset + ipoin1D)
+      accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+               accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(2,ioffset + ipoin1D)
+      accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+               accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(3,ioffset + ipoin1D)
+    enddo
+
+! receive from worker #2 and add to local array
+  if(NCHUNKS /= 2) then
+
+    sender = iproc_worker2_corners(imsg)
+
+#ifdef USE_MPI
+    call MPI_RECV(buffer_recv_chunkcorners_vector,NDIM*NGLOB1D_RADIAL_all, &
+          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+      accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+               accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(1,ipoin1D)
+      accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+               accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(2,ipoin1D)
+      accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+               accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(3,ipoin1D)
+    enddo
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+      accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+               accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(1,ioffset + ipoin1D)
+      accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+               accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(2,ioffset + ipoin1D)
+      accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+               accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+               buffer_recv_chunkcorners_vector(3,ioffset + ipoin1D)
+    enddo
+
+  endif
+
+  endif
+
+!---- send messages from the two workers to the master
+  if(myrank==iproc_worker1_corners(imsg) .or. &
+              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+
+    receiver = iproc_master_corners(imsg)
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+      buffer_send_chunkcorners_vector(1,ipoin1D) = accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(2,ipoin1D) = accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(3,ipoin1D) = accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+    enddo
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+      buffer_send_chunkcorners_vector(1,ioffset + ipoin1D) = accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(2,ioffset + ipoin1D) = accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(3,ioffset + ipoin1D) = accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners))
+    enddo
+
+#ifdef USE_MPI
+    call MPI_SEND(buffer_send_chunkcorners_vector,NDIM*NGLOB1D_RADIAL_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+#endif
+
+  endif
+
+! *********************************************************************
+!  transmit messages back in opposite direction (master -> two workers)
+! *********************************************************************
+
+!---- receive messages from the master on the two workers
+  if(myrank==iproc_worker1_corners(imsg) .or. &
+              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+
+! receive from master and copy to local array
+    sender = iproc_master_corners(imsg)
+
+#ifdef USE_MPI
+    call MPI_RECV(buffer_recv_chunkcorners_vector,NDIM*NGLOB1D_RADIAL_all, &
+          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+#endif
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+      accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = buffer_recv_chunkcorners_vector(1,ipoin1D)
+      accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = buffer_recv_chunkcorners_vector(2,ipoin1D)
+      accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = buffer_recv_chunkcorners_vector(3,ipoin1D)
+    enddo
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+      accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) = buffer_recv_chunkcorners_vector(1,ioffset + ipoin1D)
+      accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) = buffer_recv_chunkcorners_vector(2,ioffset + ipoin1D)
+      accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) = buffer_recv_chunkcorners_vector(3,ioffset + ipoin1D)
+    enddo
+
+  endif
+
+!---- send messages from the master to the two workers
+  if(myrank==iproc_master_corners(imsg)) then
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+      buffer_send_chunkcorners_vector(1,ipoin1D) = accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(2,ipoin1D) = accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(3,ipoin1D) = accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+    enddo
+
+    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+      buffer_send_chunkcorners_vector(1,ioffset + ipoin1D) = accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(2,ioffset + ipoin1D) = accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners))
+      buffer_send_chunkcorners_vector(3,ioffset + ipoin1D) = accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners))
+    enddo
+
+! send to worker #1
+    receiver = iproc_worker1_corners(imsg)
+#ifdef USE_MPI
+    call MPI_SEND(buffer_send_chunkcorners_vector,NDIM*NGLOB1D_RADIAL_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+#endif
+
+! send to worker #2
+  if(NCHUNKS /= 2) then
+    receiver = iproc_worker2_corners(imsg)
+#ifdef USE_MPI
+    call MPI_SEND(buffer_send_chunkcorners_vector,NDIM*NGLOB1D_RADIAL_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+#endif
+
+  endif
+
+  endif
+
+  enddo
+
+  end subroutine assemble_MPI_vector_block
+
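
The block above is the tail of the corner assembly between chunks: each chunk corner has one "master" slice and one or two "worker" slices (the second worker only when NCHUNKS /= 2). The workers send their partial contributions, the master adds them into its own accel arrays, and the assembled values are then sent back so that every slice sharing the corner ends up with identical values; crust/mantle points are packed first in each buffer and inner-core points follow at ioffset. As a rough standalone sketch of that gather-sum-scatter hand-shake (not part of this commit; the names corner_assembly_sketch, accel, buffer and n are invented for the illustration), the same idea with plain blocking calls on a tiny 1-D buffer looks like this:

! toy program: rank 0 plays the corner "master", ranks 1 and 2 play the workers
  program corner_assembly_sketch
  use mpi
  implicit none
  integer, parameter :: n = 4          ! number of points shared at the corner (toy value)
  integer :: myrank,nproc,ier,iworker
  integer :: msg_status(MPI_STATUS_SIZE)
  real :: accel(n),buffer(n)

  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)
  call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ier)

! each rank starts from its own partial contribution
  accel(:) = real(myrank + 1)

  if(myrank == 0) then
! master: receive each worker's contribution and add it to the local array
    do iworker = 1,min(nproc,3) - 1
      call MPI_RECV(buffer,n,MPI_REAL,iworker,0,MPI_COMM_WORLD,msg_status,ier)
      accel(:) = accel(:) + buffer(:)
    enddo
! master: send the assembled values back to the workers
    do iworker = 1,min(nproc,3) - 1
      call MPI_SEND(accel,n,MPI_REAL,iworker,1,MPI_COMM_WORLD,ier)
    enddo
  else if(myrank <= 2) then
! worker: send the local contribution, then overwrite it with the assembled result
    call MPI_SEND(accel,n,MPI_REAL,0,0,MPI_COMM_WORLD,ier)
    call MPI_RECV(accel,n,MPI_REAL,0,1,MPI_COMM_WORLD,msg_status,ier)
  endif

  if(myrank <= min(nproc,3) - 1) print *,'rank',myrank,'assembled value',accel(1)

  call MPI_FINALIZE(ier)
  end program corner_assembly_sketch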

Copied: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90 (from rev 12881, seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.f90)
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -0,0 +1,1062 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory, California Institute of Technology, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) California Institute of Technology and University of Pau / CNRS / INRIA
+!                            August 2008
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! compute forces in the crust/mantle and in the inner core (i.e., all the solid regions)
+  subroutine compute_forces_CM_IC(displ_crust_mantle,accel_crust_mantle, &
+          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle,muhstore_crust_mantle, &
+          eta_anisostore_crust_mantle, &
+          ibool_crust_mantle,idoubling_crust_mantle,R_memory_crust_mantle,epsilondev_crust_mantle,one_minus_sum_beta_crust_mantle, &
+          factor_common_crust_mantle,vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle, &
+          displ_inner_core,accel_inner_core, &
+          xix_inner_core,xiy_inner_core,xiz_inner_core,etax_inner_core,etay_inner_core,etaz_inner_core, &
+          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
+          R_memory_inner_core,epsilondev_inner_core,&
+          one_minus_sum_beta_inner_core,factor_common_inner_core, &
+          vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core, &
+          hprime_xx,hprime_yy,hprime_zz, &
+          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+          alphaval,betaval,gammaval, &
+#ifdef USE_MPI
+            is_on_a_slice_edge_crust_mantle,is_on_a_slice_edge_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUM_MSG_TYPES,iphase, &
+#endif
+          COMPUTE_AND_STORE_STRAIN,AM_V,icall)
+
+  implicit none
+
+  include "constants.h"
+
+! include values created by the mesher
+! done only for performance, using static allocation to allow for loop unrolling
+  include "values_from_mesher.h"
+
+  integer :: icall
+
+! attenuation_model_variables
+  type attenuation_model_variables
+    sequence
+    double precision min_period, max_period
+    double precision                          :: QT_c_source        ! Source Frequency
+    double precision, dimension(N_SLS)        :: Qtau_s             ! tau_sigma
+    double precision, dimension(:), pointer   :: QrDisc             ! Discontinuities Defined
+    double precision, dimension(:), pointer   :: Qr                 ! Radius
+    integer, dimension(:), pointer            :: interval_Q                 ! Steps
+    double precision, dimension(:), pointer   :: Qmu                ! Shear Attenuation
+    double precision, dimension(:,:), pointer :: Qtau_e             ! tau_epsilon
+    double precision, dimension(:), pointer   :: Qone_minus_sum_beta, Qone_minus_sum_beta2      ! one_minus_sum_beta
+    double precision, dimension(:,:), pointer :: Qfc, Qfc2          ! factor_common
+    double precision, dimension(:), pointer   :: Qsf, Qsf2          ! scale_factor
+    integer, dimension(:), pointer            :: Qrmin              ! Min values of idoubling
+    integer, dimension(:), pointer            :: Qrmax              ! Max values of idoubling
+    integer                                   :: Qn                 ! Number of points
+  end type attenuation_model_variables
+
+  type (attenuation_model_variables) AM_V
+! attenuation_model_variables
+
+! for forward or backward simulations
+  logical COMPUTE_AND_STORE_STRAIN
+
+! array with the local to global mapping per slice
+  integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling_crust_mantle
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ_crust_mantle,accel_crust_mantle
+
+! memory variables for attenuation
+! memory variables R_ij are stored at the local rather than global level
+! to allow for optimization of cache access by compiler
+  integer i_sls,i_memory
+
+! variable lengths for factor_common and one_minus_sum_beta
+  integer :: vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle
+
+  real(kind=CUSTOM_REAL) one_minus_sum_beta_use,minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle) :: &
+       one_minus_sum_beta_crust_mantle
+
+  integer iregion_selected
+
+! for attenuation
+  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
+  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev_crust_mantle
+
+! [alpha,beta,gamma]val reduced to N_SLS and factor_common to N_SLS*NUM_NODES
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+  real(kind=CUSTOM_REAL), dimension(N_SLS,vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle) :: &
+       factor_common_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
+        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+! x y and z contain r theta and phi
+  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
+        kappavstore_crust_mantle,muvstore_crust_mantle
+
+! store anisotropic properties only where needed to save memory
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
+        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle
+
+  integer ispec,iglob
+  integer i,j,k,l
+
+! the 21 coefficients for an anisotropic medium in reduced notation
+  real(kind=CUSTOM_REAL) c11,c22,c33,c44,c55,c66,c12,c13,c23,c14,c24,c34,c15,c25,c35,c45,c16,c26,c36,c46,c56
+
+  real(kind=CUSTOM_REAL) rhovphsq,sinphifour,cosphisq,sinphisq,costhetasq,rhovsvsq,sinthetasq, &
+        cosphifour,costhetafour,rhovpvsq,sinthetafour,rhovshsq,cosfourphi, &
+        costwotheta,cosfourtheta,sintwophisq,costheta,sinphi,sintheta,cosphi, &
+        sintwotheta,costwophi,sintwophi,costwothetasq,costwophisq,phi,theta
+
+  real(kind=CUSTOM_REAL) two_rhovpvsq,two_rhovphsq,two_rhovsvsq,two_rhovshsq
+  real(kind=CUSTOM_REAL) four_rhovpvsq,four_rhovphsq,four_rhovsvsq,four_rhovshsq
+
+  real(kind=CUSTOM_REAL) twoetaminone,etaminone,eta_aniso
+  real(kind=CUSTOM_REAL) two_eta_aniso,four_eta_aniso,six_eta_aniso
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
+
+  real(kind=CUSTOM_REAL) hp1,hp2,hp3
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal,kappavl,kappahl,muvl,muhl
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sum_terms
+
+  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
+  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
+  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
+
+  real(kind=CUSTOM_REAL) radius_cr
+
+!=====================================================================
+!=====================================================================
+!=====================================================================
+
+! same attenuation everywhere in the inner core therefore no need to use Brian's routines
+  integer, parameter :: iregion_selected_inner_core = 1
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
+
+! variable lengths for factor_common and one_minus_sum_beta
+  integer :: vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core) :: one_minus_sum_beta_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(N_SLS,vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core) :: factor_common_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory_inner_core
+  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev_inner_core
+
+! array with the local to global mapping per slice
+  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
+       xix_inner_core,xiy_inner_core,xiz_inner_core,etax_inner_core,etay_inner_core,etaz_inner_core,&
+       gammax_inner_core,gammay_inner_core,gammaz_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: kappavstore_inner_core,muvstore_inner_core
+
+  integer :: computed_elements
+
+#ifdef USE_MPI
+  integer :: iphase
+
+  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
+  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
+
+  integer :: ichunk,iproc_xi,iproc_eta,myrank
+
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+
+  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
+  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
+
+! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
+
+! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
+  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+
+  integer, dimension(NGLOB2DMAX_XY_VAL_CM,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+  integer, dimension(NGLOB2DMAX_XY_VAL_IC,NUMFACES_SHARED) :: iboolfaces_inner_core
+
+  integer :: npoin2D_max_all
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all) :: buffer_send_faces,buffer_received_faces
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
+     buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector
+
+! number of message types
+  integer NUM_MSG_TYPES
+#endif
+
+! ****************************************************
+!   big loop over all spectral elements in the solid
+! ****************************************************
+
+! set acceleration to zero
+  if(icall == 1) accel_crust_mantle(:,:) = 0._CUSTOM_REAL
+
+  computed_elements = 0
+
+  do ispec = 1,NSPEC_CRUST_MANTLE
+
+#ifdef USE_MPI
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
+
+! process the communications every ELEMENTS_BETWEEN_NONBLOCKING elements
+    computed_elements = computed_elements + 1
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_BETWEEN_NONBLOCKING) == 0) &
+         call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1), &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES_VAL,NUM_MSG_TYPES,NCORNERSCHUNKS_VAL, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
+            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+#endif
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          tempx1l = 0._CUSTOM_REAL
+          tempx2l = 0._CUSTOM_REAL
+          tempx3l = 0._CUSTOM_REAL
+
+          tempy1l = 0._CUSTOM_REAL
+          tempy2l = 0._CUSTOM_REAL
+          tempy3l = 0._CUSTOM_REAL
+
+          tempz1l = 0._CUSTOM_REAL
+          tempz2l = 0._CUSTOM_REAL
+          tempz3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            iglob = ibool_crust_mantle(l,j,k,ispec)
+            tempx1l = tempx1l + displ_crust_mantle(1,iglob)*hp1
+            tempy1l = tempy1l + displ_crust_mantle(2,iglob)*hp1
+            tempz1l = tempz1l + displ_crust_mantle(3,iglob)*hp1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            hp2 = hprime_yy(j,l)
+            iglob = ibool_crust_mantle(i,l,k,ispec)
+            tempx2l = tempx2l + displ_crust_mantle(1,iglob)*hp2
+            tempy2l = tempy2l + displ_crust_mantle(2,iglob)*hp2
+            tempz2l = tempz2l + displ_crust_mantle(3,iglob)*hp2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            hp3 = hprime_zz(k,l)
+            iglob = ibool_crust_mantle(i,j,l,ispec)
+            tempx3l = tempx3l + displ_crust_mantle(1,iglob)*hp3
+            tempy3l = tempy3l + displ_crust_mantle(2,iglob)*hp3
+            tempz3l = tempz3l + displ_crust_mantle(3,iglob)*hp3
+          enddo
+
+!         get derivatives of ux, uy and uz with respect to x, y and z
+
+          xixl = xix_crust_mantle(i,j,k,ispec)
+          xiyl = xiy_crust_mantle(i,j,k,ispec)
+          xizl = xiz_crust_mantle(i,j,k,ispec)
+          etaxl = etax_crust_mantle(i,j,k,ispec)
+          etayl = etay_crust_mantle(i,j,k,ispec)
+          etazl = etaz_crust_mantle(i,j,k,ispec)
+          gammaxl = gammax_crust_mantle(i,j,k,ispec)
+          gammayl = gammay_crust_mantle(i,j,k,ispec)
+          gammazl = gammaz_crust_mantle(i,j,k,ispec)
+
+! compute the jacobian
+          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                        + xizl*(etaxl*gammayl-etayl*gammaxl))
+
+          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
+          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
+          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+
+          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
+          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
+          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+
+! precompute some sums to save CPU time
+          duxdxl_plus_duydyl = duxdxl + duydyl
+          duxdxl_plus_duzdzl = duxdxl + duzdzl
+          duydyl_plus_duzdzl = duydyl + duzdzl
+          duxdyl_plus_duydxl = duxdyl + duydxl
+          duzdxl_plus_duxdzl = duzdxl + duxdzl
+          duzdyl_plus_duydzl = duzdyl + duydzl
+
+! compute deviatoric strain
+ if (COMPUTE_AND_STORE_STRAIN) then
+    epsilondev_loc(1,i,j,k) = duxdxl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
+    epsilondev_loc(2,i,j,k) = duydyl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
+    epsilondev_loc(3,i,j,k) = 0.5 * duxdyl_plus_duydxl
+    epsilondev_loc(4,i,j,k) = 0.5 * duzdxl_plus_duxdzl
+    epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
+  endif
+
+! precompute terms for attenuation if needed
+  if(ATTENUATION_VAL) then
+    radius_cr = xstore_crust_mantle(ibool_crust_mantle(i,j,k,ispec))
+    call get_attenuation_index(idoubling_crust_mantle(ispec), dble(radius_cr), iregion_selected, .FALSE., AM_V)
+    one_minus_sum_beta_use = one_minus_sum_beta_crust_mantle(1,1,1,iregion_selected)
+    minus_sum_beta =  one_minus_sum_beta_use - 1.0
+  endif
+
+!
+! compute either isotropic or anisotropic elements
+!
+
+  if(ANISOTROPIC_3D_MANTLE_VAL) then
+
+  else
+
+! do not use transverse isotropy except if element is between d220 and Moho
+  if(.not. (TRANSVERSE_ISOTROPY_VAL .and. (idoubling_crust_mantle(ispec)==IFLAG_220_80 .or. &
+                                           idoubling_crust_mantle(ispec)==IFLAG_80_MOHO))) then
+
+! layer with no transverse isotropy, use kappav and muv
+          kappal = kappavstore_crust_mantle(i,j,k,ispec)
+          mul = muvstore_crust_mantle(i,j,k,ispec)
+
+! use unrelaxed parameters if attenuation
+    if(ATTENUATION_VAL) mul = mul * one_minus_sum_beta_use
+
+          lambdalplus2mul = kappal + FOUR_THIRDS * mul
+          lambdal = lambdalplus2mul - 2.*mul
+
+! compute stress sigma
+
+          sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+          sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+          sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+          sigma_xy = mul*duxdyl_plus_duydxl
+          sigma_xz = mul*duzdxl_plus_duxdzl
+          sigma_yz = mul*duzdyl_plus_duydzl
+
+    else
+
+! use Kappa and mu from transversely isotropic model
+      kappavl = kappavstore_crust_mantle(i,j,k,ispec)
+      muvl = muvstore_crust_mantle(i,j,k,ispec)
+
+      kappahl = kappahstore_crust_mantle(i,j,k,ispec)
+      muhl = muhstore_crust_mantle(i,j,k,ispec)
+
+! use unrelaxed parameters if attenuation
+! eta does not need to be shifted since it is a ratio
+  if(ATTENUATION_VAL) then
+    muvl = muvl * one_minus_sum_beta_use
+    muhl = muhl * one_minus_sum_beta_use
+  endif
+
+  rhovpvsq = kappavl + FOUR_THIRDS * muvl  !!! that is C
+  rhovphsq = kappahl + FOUR_THIRDS * muhl  !!! that is A
+
+  rhovsvsq = muvl  !!! that is L
+  rhovshsq = muhl  !!! that is N
+
+  eta_aniso = eta_anisostore_crust_mantle(i,j,k,ispec)  !!! that is  F / (A - 2 L)
+
+! use mesh coordinates to get theta and phi
+! ystore_crust_mantle and zstore_crust_mantle contain theta and phi
+
+  iglob = ibool_crust_mantle(i,j,k,ispec)
+  theta = ystore_crust_mantle(iglob)
+  phi = zstore_crust_mantle(iglob)
+
+  costheta = cos(theta)
+  sintheta = sin(theta)
+  cosphi = cos(phi)
+  sinphi = sin(phi)
+
+  costhetasq = costheta * costheta
+  sinthetasq = sintheta * sintheta
+  cosphisq = cosphi * cosphi
+  sinphisq = sinphi * sinphi
+
+  costhetafour = costhetasq * costhetasq
+  sinthetafour = sinthetasq * sinthetasq
+  cosphifour = cosphisq * cosphisq
+  sinphifour = sinphisq * sinphisq
+
+  costwotheta = cos(2.*theta)
+  sintwotheta = sin(2.*theta)
+  costwophi = cos(2.*phi)
+  sintwophi = sin(2.*phi)
+
+  cosfourtheta = cos(4.*theta)
+  cosfourphi = cos(4.*phi)
+
+  costwothetasq = costwotheta * costwotheta
+
+  costwophisq = costwophi * costwophi
+  sintwophisq = sintwophi * sintwophi
+
+  etaminone = eta_aniso - 1.
+  twoetaminone = 2. * eta_aniso - 1.
+
+! precompute some products to reduce the CPU time
+
+      two_eta_aniso = 2.*eta_aniso
+      four_eta_aniso = 4.*eta_aniso
+      six_eta_aniso = 6.*eta_aniso
+
+      two_rhovpvsq = 2.*rhovpvsq
+      two_rhovphsq = 2.*rhovphsq
+      two_rhovsvsq = 2.*rhovsvsq
+      two_rhovshsq = 2.*rhovshsq
+
+      four_rhovpvsq = 4.*rhovpvsq
+      four_rhovphsq = 4.*rhovphsq
+      four_rhovsvsq = 4.*rhovsvsq
+      four_rhovshsq = 4.*rhovshsq
+
+! the 21 anisotropic coefficients computed using Mathematica
+
+ c11 = rhovphsq*sinphifour + 2.*cosphisq*sinphisq* &
+   (rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
+      sinthetasq) + cosphifour* &
+   (rhovphsq*costhetafour + 2.*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
+      costhetasq*sinthetasq + rhovpvsq*sinthetafour)
+
+ c12 = ((rhovphsq - two_rhovshsq)*(3. + cosfourphi)*costhetasq)/4. - &
+  four_rhovshsq*cosphisq*costhetasq*sinphisq + &
+  (rhovphsq*(11. + 4.*costwotheta + cosfourtheta)*sintwophisq)/32. + &
+  eta_aniso*(rhovphsq - two_rhovsvsq)*(cosphifour + &
+     2.*cosphisq*costhetasq*sinphisq + sinphifour)*sinthetasq + &
+  rhovpvsq*cosphisq*sinphisq*sinthetafour - &
+  rhovsvsq*sintwophisq*sinthetafour
+
+ c13 = (cosphisq*(rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq - &
+       12.*eta_aniso*rhovsvsq + (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - &
+          four_eta_aniso*rhovsvsq)*cosfourtheta))/8. + &
+  sinphisq*(eta_aniso*(rhovphsq - two_rhovsvsq)*costhetasq + &
+     (rhovphsq - two_rhovshsq)*sinthetasq)
+
+ c14 = costheta*sinphi*((cosphisq* &
+       (-rhovphsq + rhovpvsq + four_rhovshsq - four_rhovsvsq + &
+         (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
+            four_eta_aniso*rhovsvsq)*costwotheta))/2. + &
+    (etaminone*rhovphsq + 2.*(rhovshsq - eta_aniso*rhovsvsq))*sinphisq)* sintheta
+
+ c15 = cosphi*costheta*((cosphisq* (-rhovphsq + rhovpvsq + &
+         (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
+          costwotheta))/2. + etaminone*(rhovphsq - two_rhovsvsq)*sinphisq)*sintheta
+
+ c16 = (cosphi*sinphi*(cosphisq* (-rhovphsq + rhovpvsq + &
+         (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
+            four_eta_aniso*rhovsvsq)*costwotheta) + &
+      2.*etaminone*(rhovphsq - two_rhovsvsq)*sinphisq)*sinthetasq)/2.
+
+ c22 = rhovphsq*cosphifour + 2.*cosphisq*sinphisq* &
+   (rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
+      sinthetasq) + sinphifour* &
+   (rhovphsq*costhetafour + 2.*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
+      costhetasq*sinthetasq + rhovpvsq*sinthetafour)
+
+ c23 = ((rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq - 12.*eta_aniso*rhovsvsq + &
+       (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
+        cosfourtheta)*sinphisq)/8. + &
+  cosphisq*(eta_aniso*(rhovphsq - two_rhovsvsq)*costhetasq + &
+     (rhovphsq - two_rhovshsq)*sinthetasq)
+
+ c24 = costheta*sinphi*(etaminone*(rhovphsq - two_rhovsvsq)*cosphisq + &
+    ((-rhovphsq + rhovpvsq + (twoetaminone*rhovphsq - rhovpvsq + &
+            four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)/2.)*sintheta
+
+ c25 = cosphi*costheta*((etaminone*rhovphsq + 2.*(rhovshsq - eta_aniso*rhovsvsq))* &
+     cosphisq + ((-rhovphsq + rhovpvsq + four_rhovshsq - four_rhovsvsq + &
+         (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
+            four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)/2.)*sintheta
+
+ c26 = (cosphi*sinphi*(2.*etaminone*(rhovphsq - two_rhovsvsq)*cosphisq + &
+      (-rhovphsq + rhovpvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
+            four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)*sinthetasq)/2.
+
+ c33 = rhovpvsq*costhetafour + 2.*(eta_aniso*(rhovphsq - two_rhovsvsq) + two_rhovsvsq)* &
+   costhetasq*sinthetasq + rhovphsq*sinthetafour
+
+ c34 = -((rhovphsq - rhovpvsq + (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq &
+           - four_eta_aniso*rhovsvsq)*costwotheta)*sinphi*sintwotheta)/4.
+
+ c35 = -(cosphi*(rhovphsq - rhovpvsq + &
+       (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
+        costwotheta)*sintwotheta)/4.
+
+ c36 = -((rhovphsq - rhovpvsq - four_rhovshsq + four_rhovsvsq + &
+       (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
+        costwotheta)*sintwophi*sinthetasq)/4.
+
+ c44 = cosphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) + &
+  sinphisq*(rhovsvsq*costwothetasq + &
+     (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq)*costhetasq* sinthetasq)
+
+ c45 = ((rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq + &
+      four_eta_aniso*rhovsvsq + (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + &
+         4.*etaminone*rhovsvsq)*costwotheta)*sintwophi*sinthetasq)/4.
+
+ c46 = -(cosphi*costheta*((rhovshsq - rhovsvsq)*cosphisq - &
+      ((rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq + &
+           four_eta_aniso*rhovsvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + &
+              four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)/2.)* sintheta)
+
+ c55 = sinphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) + &
+  cosphisq*(rhovsvsq*costwothetasq + &
+     (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq)*costhetasq* sinthetasq)
+
+ c56 = costheta*sinphi*((cosphisq* &
+       (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq + &
+         four_eta_aniso*rhovsvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + &
+            four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta))/2. + &
+    (-rhovshsq + rhovsvsq)*sinphisq)*sintheta
+
+ c66 = rhovshsq*costwophisq*costhetasq - &
+  2.*(rhovphsq - two_rhovshsq)*cosphisq*costhetasq*sinphisq + &
+  (rhovphsq*(11. + 4.*costwotheta + cosfourtheta)*sintwophisq)/32. - &
+  (rhovsvsq*(-6. - 2.*cosfourphi + cos(4.*phi - 2.*theta) - 2.*costwotheta + &
+       cos(2.*(2.*phi + theta)))*sinthetasq)/8. + &
+  rhovpvsq*cosphisq*sinphisq*sinthetafour - &
+  (eta_aniso*(rhovphsq - two_rhovsvsq)*sintwophisq*sinthetafour)/2.
+
+! general expression of stress tensor for full Cijkl with 21 coefficients
+
+     sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+               c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+
+     sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+               c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+
+     sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+               c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+
+     sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+               c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+
+     sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+               c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+
+     sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+               c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+  endif
+
+  endif   ! end of test whether isotropic or anisotropic element
+
+! subtract memory variables if attenuation
+  if(ATTENUATION_VAL) then
+    do i_sls = 1,N_SLS
+      R_xx_val = R_memory_crust_mantle(1,i_sls,i,j,k,ispec)
+      R_yy_val = R_memory_crust_mantle(2,i_sls,i,j,k,ispec)
+      sigma_xx = sigma_xx - R_xx_val
+      sigma_yy = sigma_yy - R_yy_val
+      sigma_zz = sigma_zz + R_xx_val + R_yy_val
+      sigma_xy = sigma_xy - R_memory_crust_mantle(3,i_sls,i,j,k,ispec)
+      sigma_xz = sigma_xz - R_memory_crust_mantle(4,i_sls,i,j,k,ispec)
+      sigma_yz = sigma_yz - R_memory_crust_mantle(5,i_sls,i,j,k,ispec)
+    enddo
+  endif
+
+! form dot product with test vector, symmetric form
+      tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
+      tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
+      tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+
+      tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
+      tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
+      tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
+
+      tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
+      tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
+      tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
+
+          enddo
+        enddo
+      enddo
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          tempx1l = 0._CUSTOM_REAL
+          tempy1l = 0._CUSTOM_REAL
+          tempz1l = 0._CUSTOM_REAL
+
+          tempx2l = 0._CUSTOM_REAL
+          tempy2l = 0._CUSTOM_REAL
+          tempz2l = 0._CUSTOM_REAL
+
+          tempx3l = 0._CUSTOM_REAL
+          tempy3l = 0._CUSTOM_REAL
+          tempz3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            fac1 = hprimewgll_xx(l,i)
+            tempx1l = tempx1l + tempx1(l,j,k)*fac1
+            tempy1l = tempy1l + tempy1(l,j,k)*fac1
+            tempz1l = tempz1l + tempz1(l,j,k)*fac1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            fac2 = hprimewgll_yy(l,j)
+            tempx2l = tempx2l + tempx2(i,l,k)*fac2
+            tempy2l = tempy2l + tempy2(i,l,k)*fac2
+            tempz2l = tempz2l + tempz2(i,l,k)*fac2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            fac3 = hprimewgll_zz(l,k)
+            tempx3l = tempx3l + tempx3(i,j,l)*fac3
+            tempy3l = tempy3l + tempy3(i,j,l)*fac3
+            tempz3l = tempz3l + tempz3(i,j,l)*fac3
+          enddo
+
+          fac1 = wgllwgll_yz(j,k)
+          fac2 = wgllwgll_xz(i,k)
+          fac3 = wgllwgll_xy(i,j)
+
+          sum_terms(1,i,j,k) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
+          sum_terms(2,i,j,k) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
+          sum_terms(3,i,j,k) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
+
+        enddo
+      enddo
+    enddo
+
+! sum contributions from each element to the global mesh and add gravity terms
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+          iglob = ibool_crust_mantle(i,j,k,ispec)
+          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,i,j,k)
+          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,i,j,k)
+          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,i,j,k)
+        enddo
+      enddo
+    enddo
+
+! update memory variables based upon a Runge-Kutta scheme.
+! convention for attenuation:
+! term in xx = 1
+! term in yy = 2
+! term in xy = 3
+! term in xz = 4
+! term in yz = 5
+! term in zz not computed since zero trace
+    if(ATTENUATION_VAL) then
+      do k = 1,NGLLZ
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            do i_sls = 1,N_SLS
+              do i_memory = 1,5
+! get coefficients for that standard linear solid
+! IMPROVE we use mu_v here even if there is some anisotropy
+! IMPROVE we should probably use an average value instead
+                R_memory_crust_mantle(i_memory,i_sls,i,j,k,ispec) = alphaval(i_sls) * &
+                  R_memory_crust_mantle(i_memory,i_sls,i,j,k,ispec) + &
+                  factor_common_crust_mantle(i_sls,1,1,1,iregion_selected) * muvstore_crust_mantle(i,j,k,ispec) * &
+                  (betaval(i_sls) * epsilondev_crust_mantle(i_memory,i,j,k,ispec) + &
+                  gammaval(i_sls) * epsilondev_loc(i_memory,i,j,k))
+              enddo
+            enddo
+          enddo
+        enddo
+      enddo
+    endif
+
+! save deviatoric strain for Runge-Kutta scheme
+  if(COMPUTE_AND_STORE_STRAIN) epsilondev_crust_mantle(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
+
+  enddo   ! spectral element loop
+
+!=====================================================================
+!=====================================================================
+!=====================================================================
+
+! ****************************************************
+!   big loop over all spectral elements in the solid
+! ****************************************************
+
+! set acceleration to zero
+  if(icall == 1) accel_inner_core(:,:) = 0._CUSTOM_REAL
+
+  computed_elements = 0
+
+  do ispec = 1,NSPEC_INNER_CORE
+
+#ifdef USE_MPI
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_inner_core(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_inner_core(ispec))) cycle
+#endif
+
+! exclude fictitious elements in central cube
+    if(idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle
+
+#ifdef USE_MPI
+! process the communications every ELEMENTS_BETWEEN_NONBLOCKING elements
+    computed_elements = computed_elements + 1
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_BETWEEN_NONBLOCKING) == 0) &
+         call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1), &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES_VAL,NUM_MSG_TYPES,NCORNERSCHUNKS_VAL, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
+            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+#endif
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          tempx1l = 0._CUSTOM_REAL
+          tempx2l = 0._CUSTOM_REAL
+          tempx3l = 0._CUSTOM_REAL
+
+          tempy1l = 0._CUSTOM_REAL
+          tempy2l = 0._CUSTOM_REAL
+          tempy3l = 0._CUSTOM_REAL
+
+          tempz1l = 0._CUSTOM_REAL
+          tempz2l = 0._CUSTOM_REAL
+          tempz3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            hp1 = hprime_xx(i,l)
+            iglob = ibool_inner_core(l,j,k,ispec)
+            tempx1l = tempx1l + displ_inner_core(1,iglob)*hp1
+            tempy1l = tempy1l + displ_inner_core(2,iglob)*hp1
+            tempz1l = tempz1l + displ_inner_core(3,iglob)*hp1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            hp2 = hprime_yy(j,l)
+            iglob = ibool_inner_core(i,l,k,ispec)
+            tempx2l = tempx2l + displ_inner_core(1,iglob)*hp2
+            tempy2l = tempy2l + displ_inner_core(2,iglob)*hp2
+            tempz2l = tempz2l + displ_inner_core(3,iglob)*hp2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            hp3 = hprime_zz(k,l)
+            iglob = ibool_inner_core(i,j,l,ispec)
+            tempx3l = tempx3l + displ_inner_core(1,iglob)*hp3
+            tempy3l = tempy3l + displ_inner_core(2,iglob)*hp3
+            tempz3l = tempz3l + displ_inner_core(3,iglob)*hp3
+          enddo
+
+!         get derivatives of ux, uy and uz with respect to x, y and z
+
+          xixl = xix_inner_core(i,j,k,ispec)
+          xiyl = xiy_inner_core(i,j,k,ispec)
+          xizl = xiz_inner_core(i,j,k,ispec)
+          etaxl = etax_inner_core(i,j,k,ispec)
+          etayl = etay_inner_core(i,j,k,ispec)
+          etazl = etaz_inner_core(i,j,k,ispec)
+          gammaxl = gammax_inner_core(i,j,k,ispec)
+          gammayl = gammay_inner_core(i,j,k,ispec)
+          gammazl = gammaz_inner_core(i,j,k,ispec)
+
+! compute the jacobian
+          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                        + xizl*(etaxl*gammayl-etayl*gammaxl))
+
+          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
+          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
+          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
+
+          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
+          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
+          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
+
+! precompute some sums to save CPU time
+          duxdxl_plus_duydyl = duxdxl + duydyl
+          duxdxl_plus_duzdzl = duxdxl + duzdzl
+          duydyl_plus_duzdzl = duydyl + duzdzl
+          duxdyl_plus_duydxl = duxdyl + duydxl
+          duzdxl_plus_duxdzl = duzdxl + duxdzl
+          duzdyl_plus_duydzl = duzdyl + duydzl
+
+! compute deviatoric strain
+  if (COMPUTE_AND_STORE_STRAIN) then
+    epsilondev_loc(1,i,j,k) = duxdxl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
+    epsilondev_loc(2,i,j,k) = duydyl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
+    epsilondev_loc(3,i,j,k) = 0.5 * duxdyl_plus_duydxl
+    epsilondev_loc(4,i,j,k) = 0.5 * duzdxl_plus_duxdzl
+    epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
+  endif
+
+  if(ATTENUATION_VAL) then
+! same attenuation everywhere in the inner core therefore no need to use Brian's routines
+!!!!!        radius_cr = xstore(ibool_inner_core(i,j,k,ispec))
+!!!!!        call get_attenuation_index(idoubling_inner_core(ispec), dble(radius_cr), iregion_selected_inner_core, .TRUE., AM_V)
+        minus_sum_beta =  one_minus_sum_beta_inner_core(1,1,1,iregion_selected_inner_core) - 1.0
+  endif ! ATTENUATION_VAL
+
+       if(ANISOTROPIC_INNER_CORE_VAL) then
+
+       else
+
+! inner core with no anisotropy, use kappav and muv for instance
+          kappal = kappavstore_inner_core(i,j,k,ispec)
+          mul = muvstore_inner_core(i,j,k,ispec)
+
+! use unrelaxed parameters if attenuation
+  if(ATTENUATION_VAL) then
+      mul = mul * one_minus_sum_beta_inner_core(1,1,1,iregion_selected_inner_core)
+  endif
+
+          lambdalplus2mul = kappal + FOUR_THIRDS * mul
+          lambdal = lambdalplus2mul - 2.*mul
+
+! compute stress sigma
+
+          sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+          sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+          sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+          sigma_xy = mul*duxdyl_plus_duydxl
+          sigma_xz = mul*duzdxl_plus_duxdzl
+          sigma_yz = mul*duzdyl_plus_duydzl
+
+        endif
+
+! subtract memory variables if attenuation
+  if(ATTENUATION_VAL) then
+    do i_sls = 1,N_SLS
+      R_xx_val = R_memory_inner_core(1,i_sls,i,j,k,ispec)
+      R_yy_val = R_memory_inner_core(2,i_sls,i,j,k,ispec)
+      sigma_xx = sigma_xx - R_xx_val
+      sigma_yy = sigma_yy - R_yy_val
+      sigma_zz = sigma_zz + R_xx_val + R_yy_val
+      sigma_xy = sigma_xy - R_memory_inner_core(3,i_sls,i,j,k,ispec)
+      sigma_xz = sigma_xz - R_memory_inner_core(4,i_sls,i,j,k,ispec)
+      sigma_yz = sigma_yz - R_memory_inner_core(5,i_sls,i,j,k,ispec)
+    enddo
+  endif
+
+! form dot product with test vector, symmetric form
+      tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
+      tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
+      tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
+
+      tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
+      tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
+      tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
+
+      tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
+      tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
+      tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
+
+          enddo
+        enddo
+      enddo
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          tempx1l = 0._CUSTOM_REAL
+          tempy1l = 0._CUSTOM_REAL
+          tempz1l = 0._CUSTOM_REAL
+
+          tempx2l = 0._CUSTOM_REAL
+          tempy2l = 0._CUSTOM_REAL
+          tempz2l = 0._CUSTOM_REAL
+
+          tempx3l = 0._CUSTOM_REAL
+          tempy3l = 0._CUSTOM_REAL
+          tempz3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            fac1 = hprimewgll_xx(l,i)
+            tempx1l = tempx1l + tempx1(l,j,k)*fac1
+            tempy1l = tempy1l + tempy1(l,j,k)*fac1
+            tempz1l = tempz1l + tempz1(l,j,k)*fac1
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            fac2 = hprimewgll_yy(l,j)
+            tempx2l = tempx2l + tempx2(i,l,k)*fac2
+            tempy2l = tempy2l + tempy2(i,l,k)*fac2
+            tempz2l = tempz2l + tempz2(i,l,k)*fac2
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            fac3 = hprimewgll_zz(l,k)
+            tempx3l = tempx3l + tempx3(i,j,l)*fac3
+            tempy3l = tempy3l + tempy3(i,j,l)*fac3
+            tempz3l = tempz3l + tempz3(i,j,l)*fac3
+          enddo
+
+          fac1 = wgllwgll_yz(j,k)
+          fac2 = wgllwgll_xz(i,k)
+          fac3 = wgllwgll_xy(i,j)
+
+          sum_terms(1,i,j,k) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
+          sum_terms(2,i,j,k) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
+          sum_terms(3,i,j,k) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
+
+        enddo
+      enddo
+    enddo
+
+! sum contributions from each element to the global mesh and add gravity terms
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+          iglob = ibool_inner_core(i,j,k,ispec)
+          accel_inner_core(:,iglob) = accel_inner_core(:,iglob) + sum_terms(:,i,j,k)
+        enddo
+      enddo
+    enddo
+
+! update memory variables based upon a Runge-Kutta scheme.
+! convention for attenuation:
+! term in xx = 1
+! term in yy = 2
+! term in xy = 3
+! term in xz = 4
+! term in yz = 5
+! term in zz not computed since zero trace
+    if(ATTENUATION_VAL) then
+      do k = 1,NGLLZ
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            do i_sls = 1,N_SLS
+              do i_memory = 1,5
+                R_memory_inner_core(i_memory,i_sls,i,j,k,ispec) = &
+                  alphaval(i_sls) * &
+                  R_memory_inner_core(i_memory,i_sls,i,j,k,ispec) + muvstore_inner_core(i,j,k,ispec) * &
+                  factor_common_inner_core(i_sls,1,1,1,iregion_selected_inner_core) * &
+                  (betaval(i_sls) * &
+                epsilondev_inner_core(i_memory,i,j,k,ispec) + gammaval(i_sls) * epsilondev_loc(i_memory,i,j,k))
+              enddo
+            enddo
+          enddo
+        enddo
+      enddo
+    endif
+
+! save deviatoric strain for Runge-Kutta scheme
+    if(COMPUTE_AND_STORE_STRAIN) epsilondev_inner_core(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
+
+  enddo ! spectral element loop
+
+  end subroutine compute_forces_CM_IC
+
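
The two element loops above overlap computation with communication: on the first call (icall == 1) only the elements flagged in is_on_a_slice_edge_crust_mantle / is_on_a_slice_edge_inner_core are computed, so that their contributions are available for sending, and on the second call (icall == 2) the interior elements are computed while assemble_MPI_vector() is called again every ELEMENTS_BETWEEN_NONBLOCKING computed elements (when USE_NONBLOCKING_COMMS is set) to make progress on the pending messages. As a rough standalone sketch of that overlap (not part of this commit; the names overlap_sketch, nelem, npoll and work are invented for the illustration), posting the messages first and polling them while doing the bulk of the work looks like this:

! toy program: each rank exchanges one small buffer with the next rank in a ring
! and polls the pending messages with MPI_TEST while doing its "interior" work
  program overlap_sketch
  use mpi
  implicit none
  integer, parameter :: n = 1000, nelem = 50000, npoll = 100
  integer :: myrank,nproc,dest,source,ier,ielem
  integer :: req_send,req_recv
  integer :: msg_status(MPI_STATUS_SIZE)
  logical :: flag_send,flag_recv
  real :: sendbuf(n),recvbuf(n),work

  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)
  call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ier)

  dest   = mod(myrank + 1,nproc)
  source = mod(myrank - 1 + nproc,nproc)
  sendbuf(:) = real(myrank)

! post the communications first (this plays the role of icall == 1,
! in which only the elements on the slice edges have been computed)
  call MPI_IRECV(recvbuf,n,MPI_REAL,source,0,MPI_COMM_WORLD,req_recv,ier)
  call MPI_ISEND(sendbuf,n,MPI_REAL,dest,0,MPI_COMM_WORLD,req_send,ier)

! then compute the interior (the role of icall == 2) while polling the messages
  work = 0.
  do ielem = 1,nelem
    work = work + sin(real(ielem))   ! stand-in for the element computation
    if(mod(ielem,npoll) == 0) then
      call MPI_TEST(req_recv,flag_recv,msg_status,ier)
      call MPI_TEST(req_send,flag_send,msg_status,ier)
    endif
  enddo

! make sure everything has completed before using the received buffer
  call MPI_WAIT(req_recv,msg_status,ier)
  call MPI_WAIT(req_send,msg_status,ier)

  print *,'rank',myrank,'received',recvbuf(1),'from',source,'work =',work

  call MPI_FINALIZE(ier)
  end program overlap_sketch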

Deleted: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.f90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.f90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -1,959 +0,0 @@
-!=====================================================================
-!
-!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
-!          --------------------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Seismological Laboratory, California Institute of Technology, USA
-!             and University of Pau / CNRS / INRIA, France
-! (c) California Institute of Technology and University of Pau / CNRS / INRIA
-!                            August 2008
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! compute forces in the crust/mantle and in the inner core (i.e., all the solid regions)
-  subroutine compute_forces_CM_IC(displ_crust_mantle,accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle,muhstore_crust_mantle, &
-          eta_anisostore_crust_mantle, &
-          ibool_crust_mantle,idoubling_crust_mantle,R_memory_crust_mantle,epsilondev_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          factor_common_crust_mantle,vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle, &
-          is_on_a_slice_edge_crust_mantle, &
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-          displ_inner_core,accel_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core,etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          R_memory_inner_core,epsilondev_inner_core,&
-          one_minus_sum_beta_inner_core,factor_common_inner_core, &
-          vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core,is_on_a_slice_edge_inner_core, &
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-          alphaval,betaval,gammaval, &
-          COMPUTE_AND_STORE_STRAIN,AM_V,icall)
-
-  implicit none
-
-  include "constants.h"
-
-! include values created by the mesher
-! done for performance only using static allocation to allow for loop unrolling
-  include "values_from_mesher.h"
-
-  integer :: icall
-
-  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
-
-! attenuation_model_variables
-  type attenuation_model_variables
-    sequence
-    double precision min_period, max_period
-    double precision                          :: QT_c_source        ! Source Frequency
-    double precision, dimension(N_SLS)        :: Qtau_s             ! tau_sigma
-    double precision, dimension(:), pointer   :: QrDisc             ! Discontinutitues Defined
-    double precision, dimension(:), pointer   :: Qr                 ! Radius
-    integer, dimension(:), pointer            :: interval_Q                 ! Steps
-    double precision, dimension(:), pointer   :: Qmu                ! Shear Attenuation
-    double precision, dimension(:,:), pointer :: Qtau_e             ! tau_epsilon
-    double precision, dimension(:), pointer   :: Qone_minus_sum_beta, Qone_minus_sum_beta2      ! one_minus_sum_beta
-    double precision, dimension(:,:), pointer :: Qfc, Qfc2          ! factor_common
-    double precision, dimension(:), pointer   :: Qsf, Qsf2          ! scale_factor
-    integer, dimension(:), pointer            :: Qrmin              ! Max and Mins of idoubling
-    integer, dimension(:), pointer            :: Qrmax              ! Max and Mins of idoubling
-    integer                                   :: Qn                 ! Number of points
-  end type attenuation_model_variables
-
-  type (attenuation_model_variables) AM_V
-! attenuation_model_variables
-
-! for forward or backward simulations
-  logical COMPUTE_AND_STORE_STRAIN
-
-! array with a flag for the layer (idoubling) of each spectral element in the slice
-  integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling_crust_mantle
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ_crust_mantle,accel_crust_mantle
-
-! memory variables for attenuation
-! memory variables R_ij are stored at the local rather than global level
-! to allow for optimization of cache access by compiler
-  integer i_sls,i_memory
-! variable-sized array dimensions for one_minus_sum_beta and factor_common
-  integer vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle
-
-  real(kind=CUSTOM_REAL) one_minus_sum_beta_use,minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle) :: &
-       one_minus_sum_beta_crust_mantle
-
-  integer iregion_selected
-
-! for attenuation
-  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev_crust_mantle
-
-! [alpha,beta,gamma]val reduced to N_SLS and factor_common to N_SLS*NUM_NODES
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-  real(kind=CUSTOM_REAL), dimension(N_SLS,vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle) :: &
-       factor_common_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-! x y and z contain r theta and phi
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        kappavstore_crust_mantle,muvstore_crust_mantle
-
-! store anisotropic properties only where needed to save memory
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle
-
-  integer ispec,iglob
-  integer i,j,k,l
-
-! the 21 coefficients for an anisotropic medium in reduced notation
-  real(kind=CUSTOM_REAL) c11,c22,c33,c44,c55,c66,c12,c13,c23,c14,c24,c34,c15,c25,c35,c45,c16,c26,c36,c46,c56
-
-  real(kind=CUSTOM_REAL) rhovphsq,sinphifour,cosphisq,sinphisq,costhetasq,rhovsvsq,sinthetasq, &
-        cosphifour,costhetafour,rhovpvsq,sinthetafour,rhovshsq,cosfourphi, &
-        costwotheta,cosfourtheta,sintwophisq,costheta,sinphi,sintheta,cosphi, &
-        sintwotheta,costwophi,sintwophi,costwothetasq,costwophisq,phi,theta
-
-  real(kind=CUSTOM_REAL) two_rhovpvsq,two_rhovphsq,two_rhovsvsq,two_rhovshsq
-  real(kind=CUSTOM_REAL) four_rhovpvsq,four_rhovphsq,four_rhovsvsq,four_rhovshsq
-
-  real(kind=CUSTOM_REAL) twoetaminone,etaminone,eta_aniso
-  real(kind=CUSTOM_REAL) two_eta_aniso,four_eta_aniso,six_eta_aniso
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
-
-  real(kind=CUSTOM_REAL) hp1,hp2,hp3
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal,kappavl,kappahl,muvl,muhl
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sum_terms
-
-  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
-  real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
-  real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
-
-  real(kind=CUSTOM_REAL) radius_cr
-
-!=====================================================================
-!=====================================================================
-!=====================================================================
-
-  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
-
-! same attenuation everywhere in the inner core therefore no need to use Brian's routines
-  integer, parameter :: iregion_selected_inner_core = 1
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
-
-! variable lengths for factor_common and one_minus_sum_beta
-  integer vx_inner_core, vy_inner_core, vz_inner_core, vnspec_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core) :: one_minus_sum_beta_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(N_SLS,vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core) :: factor_common_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev_inner_core
-
-! array with a flag for the layer (idoubling) of each spectral element in the slice
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
-       xix_inner_core,xiy_inner_core,xiz_inner_core,etax_inner_core,etay_inner_core,etaz_inner_core,&
-       gammax_inner_core,gammay_inner_core,gammaz_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: kappavstore_inner_core,muvstore_inner_core
-
-! ****************************************************
-!   big loop over all spectral elements in the solid
-! ****************************************************
-
-! set acceleration to zero
-  if(icall == 1) accel_crust_mantle(:,:) = 0._CUSTOM_REAL
-
-  do ispec = 1,NSPEC_CRUST_MANTLE
-
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
-
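A minimal sketch of the intended two-pass call sequence (hypothetical, abridged driver; only the icall values and their ordering are taken from this routine — pass 1 touches only elements on a slice edge so their contributions can be exchanged while pass 2 works on the interior):

   call compute_forces_CM_IC(..., icall = 1)   ! edge elements only
   ! post the non-blocking MPI_ISEND / MPI_IRECV for the assembled edge values
   call compute_forces_CM_IC(..., icall = 2)   ! interior elements, overlapped with the messages in flight
   ! completion is then checked with MPI_TEST before the edge values are summed in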
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          tempx1l = 0._CUSTOM_REAL
-          tempx2l = 0._CUSTOM_REAL
-          tempx3l = 0._CUSTOM_REAL
-
-          tempy1l = 0._CUSTOM_REAL
-          tempy2l = 0._CUSTOM_REAL
-          tempy3l = 0._CUSTOM_REAL
-
-          tempz1l = 0._CUSTOM_REAL
-          tempz2l = 0._CUSTOM_REAL
-          tempz3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            iglob = ibool_crust_mantle(l,j,k,ispec)
-            tempx1l = tempx1l + displ_crust_mantle(1,iglob)*hp1
-            tempy1l = tempy1l + displ_crust_mantle(2,iglob)*hp1
-            tempz1l = tempz1l + displ_crust_mantle(3,iglob)*hp1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            hp2 = hprime_yy(j,l)
-            iglob = ibool_crust_mantle(i,l,k,ispec)
-            tempx2l = tempx2l + displ_crust_mantle(1,iglob)*hp2
-            tempy2l = tempy2l + displ_crust_mantle(2,iglob)*hp2
-            tempz2l = tempz2l + displ_crust_mantle(3,iglob)*hp2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            hp3 = hprime_zz(k,l)
-            iglob = ibool_crust_mantle(i,j,l,ispec)
-            tempx3l = tempx3l + displ_crust_mantle(1,iglob)*hp3
-            tempy3l = tempy3l + displ_crust_mantle(2,iglob)*hp3
-            tempz3l = tempz3l + displ_crust_mantle(3,iglob)*hp3
-          enddo
-
-!         get derivatives of ux, uy and uz with respect to x, y and z
-
-          xixl = xix_crust_mantle(i,j,k,ispec)
-          xiyl = xiy_crust_mantle(i,j,k,ispec)
-          xizl = xiz_crust_mantle(i,j,k,ispec)
-          etaxl = etax_crust_mantle(i,j,k,ispec)
-          etayl = etay_crust_mantle(i,j,k,ispec)
-          etazl = etaz_crust_mantle(i,j,k,ispec)
-          gammaxl = gammax_crust_mantle(i,j,k,ispec)
-          gammayl = gammay_crust_mantle(i,j,k,ispec)
-          gammazl = gammaz_crust_mantle(i,j,k,ispec)
-
-! compute the jacobian
-          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                        + xizl*(etaxl*gammayl-etayl*gammaxl))
-
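Here jacobianl is the Jacobian of the mapping from the reference element (\xi,\eta,\gamma) to physical coordinates, recovered by inverting the determinant of the stored inverse-mapping derivatives:

  J \;=\; \det\frac{\partial(x,y,z)}{\partial(\xi,\eta,\gamma)}
    \;=\; \left[\det\begin{pmatrix} \xi_x & \xi_y & \xi_z \\ \eta_x & \eta_y & \eta_z \\ \gamma_x & \gamma_y & \gamma_z \end{pmatrix}\right]^{-1}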
-          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
-
-          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
-
-          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
-
-! precompute some sums to save CPU time
-          duxdxl_plus_duydyl = duxdxl + duydyl
-          duxdxl_plus_duzdzl = duxdxl + duzdzl
-          duydyl_plus_duzdzl = duydyl + duzdzl
-          duxdyl_plus_duydxl = duxdyl + duydxl
-          duzdxl_plus_duxdzl = duzdxl + duxdzl
-          duzdyl_plus_duydzl = duzdyl + duydzl
-
-! compute deviatoric strain
- if (COMPUTE_AND_STORE_STRAIN) then
-    epsilondev_loc(1,i,j,k) = duxdxl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
-    epsilondev_loc(2,i,j,k) = duydyl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
-    epsilondev_loc(3,i,j,k) = 0.5 * duxdyl_plus_duydxl
-    epsilondev_loc(4,i,j,k) = 0.5 * duzdxl_plus_duxdzl
-    epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
-  endif
-
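The five stored components are the deviatoric strain

  \epsilon^{dev}_{ij} \;=\; \tfrac{1}{2}\left(\partial_i u_j + \partial_j u_i\right) \;-\; \tfrac{1}{3}\,\delta_{ij}\,(\nabla\cdot\mathbf{u})

in the order (xx, yy, xy, xz, yz); the zz component is not stored because the tensor is traceless, \epsilon^{dev}_{zz} = -(\epsilon^{dev}_{xx} + \epsilon^{dev}_{yy}).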
-! precompute terms for attenuation if needed
-  if(ATTENUATION_VAL) then
-    radius_cr = xstore_crust_mantle(ibool_crust_mantle(i,j,k,ispec))
-    call get_attenuation_index(idoubling_crust_mantle(ispec), dble(radius_cr), iregion_selected, .FALSE., AM_V)
-    one_minus_sum_beta_use = one_minus_sum_beta_crust_mantle(1,1,1,iregion_selected)
-    minus_sum_beta =  one_minus_sum_beta_use - 1.0
-  endif
-
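one_minus_sum_beta_use is the factor 1 - \sum_{\ell} \beta_\ell; it is used below ("use unrelaxed parameters if attenuation") to shift the shear modulus to its unrelaxed value,

  \mu_U \;=\; \mu \left( 1 - \sum_{\ell=1}^{N\_SLS} \beta_\ell \right).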
-!
-! compute either isotropic or anisotropic elements
-!
-
-  if(ANISOTROPIC_3D_MANTLE_VAL) then
-
-  else
-
-! do not use transverse isotropy except if element is between d220 and Moho
-  if(.not. (TRANSVERSE_ISOTROPY_VAL .and. (idoubling_crust_mantle(ispec)==IFLAG_220_80 .or. &
-                                           idoubling_crust_mantle(ispec)==IFLAG_80_MOHO))) then
-
-! layer with no transverse isotropy, use kappav and muv
-          kappal = kappavstore_crust_mantle(i,j,k,ispec)
-          mul = muvstore_crust_mantle(i,j,k,ispec)
-
-! use unrelaxed parameters if attenuation
-    if(ATTENUATION_VAL) mul = mul * one_minus_sum_beta_use
-
-          lambdalplus2mul = kappal + FOUR_THIRDS * mul
-          lambdal = lambdalplus2mul - 2.*mul
-
-! compute stress sigma
-
-          sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-          sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-          sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-          sigma_xy = mul*duxdyl_plus_duydxl
-          sigma_xz = mul*duzdxl_plus_duxdzl
-          sigma_yz = mul*duzdyl_plus_duydzl
-
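These six assignments are the isotropic Hooke's law,

  \sigma_{ij} \;=\; \lambda\,\delta_{ij}\,(\nabla\cdot\mathbf{u}) \;+\; 2\mu\,\epsilon_{ij},
  \qquad \lambda + 2\mu = \kappa + \tfrac{4}{3}\mu,\quad \lambda = (\lambda+2\mu) - 2\mu,

with \lambda and \lambda+2\mu computed just above.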
-    else
-
-! use Kappa and mu from transversely isotropic model
-      kappavl = kappavstore_crust_mantle(i,j,k,ispec)
-      muvl = muvstore_crust_mantle(i,j,k,ispec)
-
-      kappahl = kappahstore_crust_mantle(i,j,k,ispec)
-      muhl = muhstore_crust_mantle(i,j,k,ispec)
-
-! use unrelaxed parameters if attenuation
-! eta does not need to be shifted since it is a ratio
-  if(ATTENUATION_VAL) then
-    muvl = muvl * one_minus_sum_beta_use
-    muhl = muhl * one_minus_sum_beta_use
-  endif
-
-  rhovpvsq = kappavl + FOUR_THIRDS * muvl  !!! that is C
-  rhovphsq = kappahl + FOUR_THIRDS * muhl  !!! that is A
-
-  rhovsvsq = muvl  !!! that is L
-  rhovshsq = muhl  !!! that is N
-
-  eta_aniso = eta_anisostore_crust_mantle(i,j,k,ispec)  !!! that is  F / (A - 2 L)
-
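In terms of the Love parameters of a transversely isotropic medium, the quantities set above are

  A = \kappa_h + \tfrac{4}{3}\mu_h = \rho V_{PH}^2, \quad
  C = \kappa_v + \tfrac{4}{3}\mu_v = \rho V_{PV}^2, \quad
  L = \mu_v = \rho V_{SV}^2, \quad
  N = \mu_h = \rho V_{SH}^2, \quad
  F = \eta\,(A - 2L).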
-! use mesh coordinates to get theta and phi
-! ystore_crust_mantle and zstore_crust_mantle contain theta and phi
-
-  iglob = ibool_crust_mantle(i,j,k,ispec)
-  theta = ystore_crust_mantle(iglob)
-  phi = zstore_crust_mantle(iglob)
-
-  costheta = cos(theta)
-  sintheta = sin(theta)
-  cosphi = cos(phi)
-  sinphi = sin(phi)
-
-  costhetasq = costheta * costheta
-  sinthetasq = sintheta * sintheta
-  cosphisq = cosphi * cosphi
-  sinphisq = sinphi * sinphi
-
-  costhetafour = costhetasq * costhetasq
-  sinthetafour = sinthetasq * sinthetasq
-  cosphifour = cosphisq * cosphisq
-  sinphifour = sinphisq * sinphisq
-
-  costwotheta = cos(2.*theta)
-  sintwotheta = sin(2.*theta)
-  costwophi = cos(2.*phi)
-  sintwophi = sin(2.*phi)
-
-  cosfourtheta = cos(4.*theta)
-  cosfourphi = cos(4.*phi)
-
-  costwothetasq = costwotheta * costwotheta
-
-  costwophisq = costwophi * costwophi
-  sintwophisq = sintwophi * sintwophi
-
-  etaminone = eta_aniso - 1.
-  twoetaminone = 2. * eta_aniso - 1.
-
-! precompute some products to reduce the CPU time
-
-      two_eta_aniso = 2.*eta_aniso
-      four_eta_aniso = 4.*eta_aniso
-      six_eta_aniso = 6.*eta_aniso
-
-      two_rhovpvsq = 2.*rhovpvsq
-      two_rhovphsq = 2.*rhovphsq
-      two_rhovsvsq = 2.*rhovsvsq
-      two_rhovshsq = 2.*rhovshsq
-
-      four_rhovpvsq = 4.*rhovpvsq
-      four_rhovphsq = 4.*rhovphsq
-      four_rhovsvsq = 4.*rhovsvsq
-      four_rhovshsq = 4.*rhovshsq
-
-! the 21 anisotropic coefficients computed using Mathematica
-
- c11 = rhovphsq*sinphifour + 2.*cosphisq*sinphisq* &
-   (rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
-      sinthetasq) + cosphifour* &
-   (rhovphsq*costhetafour + 2.*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
-      costhetasq*sinthetasq + rhovpvsq*sinthetafour)
-
- c12 = ((rhovphsq - two_rhovshsq)*(3. + cosfourphi)*costhetasq)/4. - &
-  four_rhovshsq*cosphisq*costhetasq*sinphisq + &
-  (rhovphsq*(11. + 4.*costwotheta + cosfourtheta)*sintwophisq)/32. + &
-  eta_aniso*(rhovphsq - two_rhovsvsq)*(cosphifour + &
-     2.*cosphisq*costhetasq*sinphisq + sinphifour)*sinthetasq + &
-  rhovpvsq*cosphisq*sinphisq*sinthetafour - &
-  rhovsvsq*sintwophisq*sinthetafour
-
- c13 = (cosphisq*(rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq - &
-       12.*eta_aniso*rhovsvsq + (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - &
-          four_eta_aniso*rhovsvsq)*cosfourtheta))/8. + &
-  sinphisq*(eta_aniso*(rhovphsq - two_rhovsvsq)*costhetasq + &
-     (rhovphsq - two_rhovshsq)*sinthetasq)
-
- c14 = costheta*sinphi*((cosphisq* &
-       (-rhovphsq + rhovpvsq + four_rhovshsq - four_rhovsvsq + &
-         (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
-            four_eta_aniso*rhovsvsq)*costwotheta))/2. + &
-    (etaminone*rhovphsq + 2.*(rhovshsq - eta_aniso*rhovsvsq))*sinphisq)* sintheta
-
- c15 = cosphi*costheta*((cosphisq* (-rhovphsq + rhovpvsq + &
-         (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
-          costwotheta))/2. + etaminone*(rhovphsq - two_rhovsvsq)*sinphisq)*sintheta
-
- c16 = (cosphi*sinphi*(cosphisq* (-rhovphsq + rhovpvsq + &
-         (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
-            four_eta_aniso*rhovsvsq)*costwotheta) + &
-      2.*etaminone*(rhovphsq - two_rhovsvsq)*sinphisq)*sinthetasq)/2.
-
- c22 = rhovphsq*cosphifour + 2.*cosphisq*sinphisq* &
-   (rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
-      sinthetasq) + sinphifour* &
-   (rhovphsq*costhetafour + 2.*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)* &
-      costhetasq*sinthetasq + rhovpvsq*sinthetafour)
-
- c23 = ((rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq - 12.*eta_aniso*rhovsvsq + &
-       (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
-        cosfourtheta)*sinphisq)/8. + &
-  cosphisq*(eta_aniso*(rhovphsq - two_rhovsvsq)*costhetasq + &
-     (rhovphsq - two_rhovshsq)*sinthetasq)
-
- c24 = costheta*sinphi*(etaminone*(rhovphsq - two_rhovsvsq)*cosphisq + &
-    ((-rhovphsq + rhovpvsq + (twoetaminone*rhovphsq - rhovpvsq + &
-            four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)/2.)*sintheta
-
- c25 = cosphi*costheta*((etaminone*rhovphsq + 2.*(rhovshsq - eta_aniso*rhovsvsq))* &
-     cosphisq + ((-rhovphsq + rhovpvsq + four_rhovshsq - four_rhovsvsq + &
-         (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
-            four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)/2.)*sintheta
-
- c26 = (cosphi*sinphi*(2.*etaminone*(rhovphsq - two_rhovsvsq)*cosphisq + &
-      (-rhovphsq + rhovpvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq - &
-            four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)*sinthetasq)/2.
-
- c33 = rhovpvsq*costhetafour + 2.*(eta_aniso*(rhovphsq - two_rhovsvsq) + two_rhovsvsq)* &
-   costhetasq*sinthetasq + rhovphsq*sinthetafour
-
- c34 = -((rhovphsq - rhovpvsq + (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq &
-           - four_eta_aniso*rhovsvsq)*costwotheta)*sinphi*sintwotheta)/4.
-
- c35 = -(cosphi*(rhovphsq - rhovpvsq + &
-       (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
-        costwotheta)*sintwotheta)/4.
-
- c36 = -((rhovphsq - rhovpvsq - four_rhovshsq + four_rhovsvsq + &
-       (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)* &
-        costwotheta)*sintwophi*sinthetasq)/4.
-
- c44 = cosphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) + &
-  sinphisq*(rhovsvsq*costwothetasq + &
-     (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq)*costhetasq* sinthetasq)
-
- c45 = ((rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq + &
-      four_eta_aniso*rhovsvsq + (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + &
-         4.*etaminone*rhovsvsq)*costwotheta)*sintwophi*sinthetasq)/4.
-
- c46 = -(cosphi*costheta*((rhovshsq - rhovsvsq)*cosphisq - &
-      ((rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq + &
-           four_eta_aniso*rhovsvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + &
-              four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)/2.)* sintheta)
-
- c55 = sinphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) + &
-  cosphisq*(rhovsvsq*costwothetasq + &
-     (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq)*costhetasq* sinthetasq)
-
- c56 = costheta*sinphi*((cosphisq* &
-       (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq + &
-         four_eta_aniso*rhovsvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + &
-            four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta))/2. + &
-    (-rhovshsq + rhovsvsq)*sinphisq)*sintheta
-
- c66 = rhovshsq*costwophisq*costhetasq - &
-  2.*(rhovphsq - two_rhovshsq)*cosphisq*costhetasq*sinphisq + &
-  (rhovphsq*(11. + 4.*costwotheta + cosfourtheta)*sintwophisq)/32. - &
-  (rhovsvsq*(-6. - 2.*cosfourphi + cos(4.*phi - 2.*theta) - 2.*costwotheta + &
-       cos(2.*(2.*phi + theta)))*sinthetasq)/8. + &
-  rhovpvsq*cosphisq*sinphisq*sinthetafour - &
-  (eta_aniso*(rhovphsq - two_rhovsvsq)*sintwophisq*sinthetafour)/2.
-
-! general expression of stress tensor for full Cijkl with 21 coefficients
-
-     sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-               c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-
-     sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-               c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-
-     sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-               c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-
-     sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-               c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-
-     sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-               c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-
-     sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-               c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-  endif
-
-  endif   ! end of test whether isotropic or anisotropic element
-
-! subtract memory variables if attenuation
-  if(ATTENUATION_VAL) then
-    do i_sls = 1,N_SLS
-      R_xx_val = R_memory_crust_mantle(1,i_sls,i,j,k,ispec)
-      R_yy_val = R_memory_crust_mantle(2,i_sls,i,j,k,ispec)
-      sigma_xx = sigma_xx - R_xx_val
-      sigma_yy = sigma_yy - R_yy_val
-      sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_memory_crust_mantle(3,i_sls,i,j,k,ispec)
-      sigma_xz = sigma_xz - R_memory_crust_mantle(4,i_sls,i,j,k,ispec)
-      sigma_yz = sigma_yz - R_memory_crust_mantle(5,i_sls,i,j,k,ispec)
-    enddo
-  endif
-
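Because the memory variables are traceless, R_{zz} = -(R_{xx}+R_{yy}), so subtracting R_{zz} from \sigma_{zz} amounts to the addition written above:

  \sigma_{zz} - R_{zz} \;=\; \sigma_{zz} + R_{xx} + R_{yy}.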
-! form dot product with test vector, symmetric form
-      tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
-      tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
-      tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
-
-      tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
-      tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
-      tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
-
-      tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
-      tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
-      tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
-
-          enddo
-        enddo
-      enddo
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          tempx1l = 0._CUSTOM_REAL
-          tempy1l = 0._CUSTOM_REAL
-          tempz1l = 0._CUSTOM_REAL
-
-          tempx2l = 0._CUSTOM_REAL
-          tempy2l = 0._CUSTOM_REAL
-          tempz2l = 0._CUSTOM_REAL
-
-          tempx3l = 0._CUSTOM_REAL
-          tempy3l = 0._CUSTOM_REAL
-          tempz3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            fac1 = hprimewgll_xx(l,i)
-            tempx1l = tempx1l + tempx1(l,j,k)*fac1
-            tempy1l = tempy1l + tempy1(l,j,k)*fac1
-            tempz1l = tempz1l + tempz1(l,j,k)*fac1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            fac2 = hprimewgll_yy(l,j)
-            tempx2l = tempx2l + tempx2(i,l,k)*fac2
-            tempy2l = tempy2l + tempy2(i,l,k)*fac2
-            tempz2l = tempz2l + tempz2(i,l,k)*fac2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            fac3 = hprimewgll_zz(l,k)
-            tempx3l = tempx3l + tempx3(i,j,l)*fac3
-            tempy3l = tempy3l + tempy3(i,j,l)*fac3
-            tempz3l = tempz3l + tempz3(i,j,l)*fac3
-          enddo
-
-          fac1 = wgllwgll_yz(j,k)
-          fac2 = wgllwgll_xz(i,k)
-          fac3 = wgllwgll_xy(i,j)
-
-          sum_terms(1,i,j,k) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
-          sum_terms(2,i,j,k) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
-          sum_terms(3,i,j,k) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
-
-        enddo
-      enddo
-    enddo
-
-! sum contributions from each element to the global mesh and add gravity terms
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          iglob = ibool_crust_mantle(i,j,k,ispec)
-          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,i,j,k)
-          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,i,j,k)
-          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,i,j,k)
-        enddo
-      enddo
-    enddo
-
-! update memory variables based upon a Runge-Kutta scheme.
-! convention for attenuation:
-! term in xx = 1
-! term in yy = 2
-! term in xy = 3
-! term in xz = 4
-! term in yz = 5
-! term in zz not computed since zero trace
-    if(ATTENUATION_VAL) then
-      do k = 1,NGLLZ
-        do j = 1,NGLLY
-          do i = 1,NGLLX
-            do i_sls = 1,N_SLS
-              do i_memory = 1,5
-! get coefficients for that standard linear solid
-! IMPROVE we use mu_v here even if there is some anisotropy
-! IMPROVE we should probably use an average value instead
-                R_memory_crust_mantle(i_memory,i_sls,i,j,k,ispec) = alphaval(i_sls) * &
-                  R_memory_crust_mantle(i_memory,i_sls,i,j,k,ispec) + &
-                  factor_common_crust_mantle(i_sls,1,1,1,iregion_selected) * muvstore_crust_mantle(i,j,k,ispec) * &
-                  (betaval(i_sls) * epsilondev_crust_mantle(i_memory,i,j,k,ispec) + &
-                  gammaval(i_sls) * epsilondev_loc(i_memory,i,j,k))
-              enddo
-            enddo
-          enddo
-        enddo
-      enddo
-    endif
-
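For each standard linear solid \ell and each of the five stored components, the update just applied reads

  R_\ell^{\,n+1} \;=\; \alpha_\ell\,R_\ell^{\,n} \;+\; \mu_v\,f_{c,\ell}\,\bigl(\beta_\ell\,\epsilon^{dev,\,n} + \gamma_\ell\,\epsilon^{dev,\,n+1}\bigr),

with (\alpha_\ell,\beta_\ell,\gamma_\ell) given by alphaval, betaval, gammaval and f_{c,\ell} the factor_common value of the selected attenuation region.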
-! save deviatoric strain for Runge-Kutta scheme
-  if(COMPUTE_AND_STORE_STRAIN) epsilondev_crust_mantle(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
-
-  enddo   ! spectral element loop
-
-!=====================================================================
-!=====================================================================
-!=====================================================================
-
-! ****************************************************
-!   big loop over all spectral elements in the solid
-! ****************************************************
-
-! set acceleration to zero
-  if(icall == 1) accel_inner_core(:,:) = 0._CUSTOM_REAL
-
-  do ispec = 1,NSPEC_INNER_CORE
-
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_inner_core(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_inner_core(ispec))) cycle
-
-! exclude fictitious elements in central cube
-    if(idoubling_inner_core(ispec) /= IFLAG_IN_FICTITIOUS_CUBE) then
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          tempx1l = 0._CUSTOM_REAL
-          tempx2l = 0._CUSTOM_REAL
-          tempx3l = 0._CUSTOM_REAL
-
-          tempy1l = 0._CUSTOM_REAL
-          tempy2l = 0._CUSTOM_REAL
-          tempy3l = 0._CUSTOM_REAL
-
-          tempz1l = 0._CUSTOM_REAL
-          tempz2l = 0._CUSTOM_REAL
-          tempz3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            hp1 = hprime_xx(i,l)
-            iglob = ibool_inner_core(l,j,k,ispec)
-            tempx1l = tempx1l + displ_inner_core(1,iglob)*hp1
-            tempy1l = tempy1l + displ_inner_core(2,iglob)*hp1
-            tempz1l = tempz1l + displ_inner_core(3,iglob)*hp1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            hp2 = hprime_yy(j,l)
-            iglob = ibool_inner_core(i,l,k,ispec)
-            tempx2l = tempx2l + displ_inner_core(1,iglob)*hp2
-            tempy2l = tempy2l + displ_inner_core(2,iglob)*hp2
-            tempz2l = tempz2l + displ_inner_core(3,iglob)*hp2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            hp3 = hprime_zz(k,l)
-            iglob = ibool_inner_core(i,j,l,ispec)
-            tempx3l = tempx3l + displ_inner_core(1,iglob)*hp3
-            tempy3l = tempy3l + displ_inner_core(2,iglob)*hp3
-            tempz3l = tempz3l + displ_inner_core(3,iglob)*hp3
-          enddo
-
-!         get derivatives of ux, uy and uz with respect to x, y and z
-
-          xixl = xix_inner_core(i,j,k,ispec)
-          xiyl = xiy_inner_core(i,j,k,ispec)
-          xizl = xiz_inner_core(i,j,k,ispec)
-          etaxl = etax_inner_core(i,j,k,ispec)
-          etayl = etay_inner_core(i,j,k,ispec)
-          etazl = etaz_inner_core(i,j,k,ispec)
-          gammaxl = gammax_inner_core(i,j,k,ispec)
-          gammayl = gammay_inner_core(i,j,k,ispec)
-          gammazl = gammaz_inner_core(i,j,k,ispec)
-
-! compute the jacobian
-          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                        + xizl*(etaxl*gammayl-etayl*gammaxl))
-
-          duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
-
-          duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l
-          duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l
-          duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l
-
-          duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l
-          duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
-          duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
-
-! precompute some sums to save CPU time
-          duxdxl_plus_duydyl = duxdxl + duydyl
-          duxdxl_plus_duzdzl = duxdxl + duzdzl
-          duydyl_plus_duzdzl = duydyl + duzdzl
-          duxdyl_plus_duydxl = duxdyl + duydxl
-          duzdxl_plus_duxdzl = duzdxl + duxdzl
-          duzdyl_plus_duydzl = duzdyl + duydzl
-
-! compute deviatoric strain
-  if (COMPUTE_AND_STORE_STRAIN) then
-    epsilondev_loc(1,i,j,k) = duxdxl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
-    epsilondev_loc(2,i,j,k) = duydyl - ONE_THIRD * (duxdxl + duydyl + duzdzl)
-    epsilondev_loc(3,i,j,k) = 0.5 * duxdyl_plus_duydxl
-    epsilondev_loc(4,i,j,k) = 0.5 * duzdxl_plus_duxdzl
-    epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
-  endif
-
-  if(ATTENUATION_VAL) then
-! same attenuation everywhere in the inner core therefore no need to use Brian's routines
-!!!!!        radius_cr = xstore(ibool_inner_core(i,j,k,ispec))
-!!!!!        call get_attenuation_index(idoubling_inner_core(ispec), dble(radius_cr), iregion_selected_inner_core, .TRUE., AM_V)
-        minus_sum_beta =  one_minus_sum_beta_inner_core(1,1,1,iregion_selected_inner_core) - 1.0
-  endif ! ATTENUATION_VAL
-
-       if(ANISOTROPIC_INNER_CORE_VAL) then
-
-       else
-
-! inner core with no anisotropy, use kappav and muv for instance
-! layer with no anisotropy, use kappav and muv for instance
-          kappal = kappavstore_inner_core(i,j,k,ispec)
-          mul = muvstore_inner_core(i,j,k,ispec)
-
-! use unrelaxed parameters if attenuation
-  if(ATTENUATION_VAL) then
-      mul = mul * one_minus_sum_beta_inner_core(1,1,1,iregion_selected_inner_core)
-  endif
-
-          lambdalplus2mul = kappal + FOUR_THIRDS * mul
-          lambdal = lambdalplus2mul - 2.*mul
-
-! compute stress sigma
-
-          sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-          sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-          sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-          sigma_xy = mul*duxdyl_plus_duydxl
-          sigma_xz = mul*duzdxl_plus_duxdzl
-          sigma_yz = mul*duzdyl_plus_duydzl
-
-        endif
-
-! subtract memory variables if attenuation
-  if(ATTENUATION_VAL) then
-    do i_sls = 1,N_SLS
-      R_xx_val = R_memory_inner_core(1,i_sls,i,j,k,ispec)
-      R_yy_val = R_memory_inner_core(2,i_sls,i,j,k,ispec)
-      sigma_xx = sigma_xx - R_xx_val
-      sigma_yy = sigma_yy - R_yy_val
-      sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_memory_inner_core(3,i_sls,i,j,k,ispec)
-      sigma_xz = sigma_xz - R_memory_inner_core(4,i_sls,i,j,k,ispec)
-      sigma_yz = sigma_yz - R_memory_inner_core(5,i_sls,i,j,k,ispec)
-    enddo
-  endif
-
-! form dot product with test vector, symmetric form
-      tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl)
-      tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl)
-      tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
-
-      tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl)
-      tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl)
-      tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl)
-
-      tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl)
-      tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl)
-      tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl)
-
-          enddo
-        enddo
-      enddo
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          tempx1l = 0._CUSTOM_REAL
-          tempy1l = 0._CUSTOM_REAL
-          tempz1l = 0._CUSTOM_REAL
-
-          tempx2l = 0._CUSTOM_REAL
-          tempy2l = 0._CUSTOM_REAL
-          tempz2l = 0._CUSTOM_REAL
-
-          tempx3l = 0._CUSTOM_REAL
-          tempy3l = 0._CUSTOM_REAL
-          tempz3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            fac1 = hprimewgll_xx(l,i)
-            tempx1l = tempx1l + tempx1(l,j,k)*fac1
-            tempy1l = tempy1l + tempy1(l,j,k)*fac1
-            tempz1l = tempz1l + tempz1(l,j,k)*fac1
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            fac2 = hprimewgll_yy(l,j)
-            tempx2l = tempx2l + tempx2(i,l,k)*fac2
-            tempy2l = tempy2l + tempy2(i,l,k)*fac2
-            tempz2l = tempz2l + tempz2(i,l,k)*fac2
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            fac3 = hprimewgll_zz(l,k)
-            tempx3l = tempx3l + tempx3(i,j,l)*fac3
-            tempy3l = tempy3l + tempy3(i,j,l)*fac3
-            tempz3l = tempz3l + tempz3(i,j,l)*fac3
-          enddo
-
-          fac1 = wgllwgll_yz(j,k)
-          fac2 = wgllwgll_xz(i,k)
-          fac3 = wgllwgll_xy(i,j)
-
-          sum_terms(1,i,j,k) = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l)
-          sum_terms(2,i,j,k) = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l)
-          sum_terms(3,i,j,k) = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l)
-
-        enddo
-      enddo
-    enddo
-
-! sum contributions from each element to the global mesh and add gravity terms
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          iglob = ibool_inner_core(i,j,k,ispec)
-          accel_inner_core(:,iglob) = accel_inner_core(:,iglob) + sum_terms(:,i,j,k)
-        enddo
-      enddo
-    enddo
-
-! update memory variables based upon a Runge-Kutta scheme.
-! convention for attenuation:
-! term in xx = 1
-! term in yy = 2
-! term in xy = 3
-! term in xz = 4
-! term in yz = 5
-! term in zz not computed since zero trace
-    if(ATTENUATION_VAL) then
-      do k = 1,NGLLZ
-        do j = 1,NGLLY
-          do i = 1,NGLLX
-            do i_sls = 1,N_SLS
-              do i_memory = 1,5
-                R_memory_inner_core(i_memory,i_sls,i,j,k,ispec) = &
-                  alphaval(i_sls) * &
-                  R_memory_inner_core(i_memory,i_sls,i,j,k,ispec) + muvstore_inner_core(i,j,k,ispec) * &
-                  factor_common_inner_core(i_sls,1,1,1,iregion_selected_inner_core) * &
-                  (betaval(i_sls) * &
-                epsilondev_inner_core(i_memory,i,j,k,ispec) + gammaval(i_sls) * epsilondev_loc(i_memory,i,j,k))
-              enddo
-            enddo
-          enddo
-        enddo
-      enddo
-    endif
-
-! save deviatoric strain for Runge-Kutta scheme
-    if(COMPUTE_AND_STORE_STRAIN) epsilondev_inner_core(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
-
-   endif   ! end test to exclude fictitious elements in central cube
-
-  enddo ! spectral element loop
-
-  end subroutine compute_forces_CM_IC
-

Copied: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90 (from rev 12881, seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.f90)
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -0,0 +1,310 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory, California Institute of Technology, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) California Institute of Technology and University of Pau / CNRS / INRIA
+!                            August 2008
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+! compute forces in the outer core (i.e., the fluid region)
+  subroutine compute_forces_OC(d_ln_density_dr_table, &
+          displ_outer_core,accel_outer_core,xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+          xix_outer_core,xiy_outer_core,xiz_outer_core, &
+          etax_outer_core,etay_outer_core,etaz_outer_core, &
+          gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
+#ifdef USE_MPI
+          is_on_a_slice_edge_outer_core, &
+          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+          iboolfaces_outer_core,iboolcorner_outer_core, &
+          iprocfrom_faces,iprocto_faces,imsg_type, &
+          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+          buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
+          NUM_MSG_TYPES,iphase, &
+#endif
+          hprime_xx,hprime_yy,hprime_zz, &
+          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,ibool_outer_core,icall)
+
+  implicit none
+
+  include "constants.h"
+
+! include values created by the mesher
+! done for performance only using static allocation to allow for loop unrolling
+  include "values_from_mesher.h"
+
+#ifdef USE_MPI
+  integer :: ichunk,iproc_xi,iproc_eta,myrank
+
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_OC) :: iboolleft_xi_outer_core,iboolright_xi_outer_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_OC) :: iboolleft_eta_outer_core,iboolright_eta_outer_core
+
+  integer npoin2D_faces_outer_core(NUMFACES_SHARED)
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
+
+! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
+
+! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+! indirect addressing for each message for faces and corners of the chunks
+! a given slice can belong to at most one corner and at most two faces
+  integer, dimension(NGLOB2DMAX_XY_VAL_OC,NUMFACES_SHARED) :: iboolfaces_outer_core
+
+! buffers for send and receive between faces of the slices and the chunks
+! we use the same buffers to assemble scalars and vectors because vectors are
+! always three times bigger and therefore scalars can use the first part
+! of the vector buffer in memory even if it has an additional index here
+! allocate these automatic arrays in the memory stack to avoid memory fragmentation with "allocate()"
+  integer :: npoin2D_max_all
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all) :: buffer_send_faces,buffer_received_faces
+
+  integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
+
+  real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL_OC) :: buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar
+
+! number of message types
+  integer NUM_MSG_TYPES
+
+  logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
+
+  integer :: iphase
+#endif
+
+  integer :: icall
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: displ_outer_core,accel_outer_core
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: xix_outer_core,xiy_outer_core,xiz_outer_core, &
+                      etax_outer_core,etay_outer_core,etaz_outer_core,gammax_outer_core,gammay_outer_core,gammaz_outer_core
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
+
+! for gravity
+  integer int_radius
+  double precision radius,theta,phi
+  double precision cos_theta,sin_theta,cos_phi,sin_phi
+  double precision, dimension(NRAD_GRAVITY) :: d_ln_density_dr_table
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: xstore_outer_core,ystore_outer_core,zstore_outer_core
+
+  integer ispec,iglob
+  integer i,j,k,l
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) dpotentialdxl,dpotentialdyl,dpotentialdzl
+  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
+
+  double precision grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho
+
+  integer :: computed_elements
+
+! ****************************************************
+!   big loop over all spectral elements in the fluid
+! ****************************************************
+
+! set acceleration to zero
+  if(icall == 1) accel_outer_core(:) = 0._CUSTOM_REAL
+
+  computed_elements = 0
+
+  do ispec = 1,NSPEC_OUTER_CORE
+
+#ifdef USE_MPI
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_outer_core(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_outer_core(ispec))) cycle
+
+! process the communications every ELEMENTS_BETWEEN_NONBLOCKING elements
+    computed_elements = computed_elements + 1
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_BETWEEN_NONBLOCKING) == 0) &
+         call assemble_MPI_scalar(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+            iboolfaces_outer_core,iboolcorner_outer_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
+            NUMMSGS_FACES_VAL,NUM_MSG_TYPES,NCORNERSCHUNKS_VAL, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_OC, &
+            NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC,NGLOB2DMAX_XY_VAL_OC,NCHUNKS_VAL,iphase)
+#endif
+
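A minimal sketch of what this interleaving amounts to (hypothetical loop and names; it assumes the iphase convention of assemble_MPI_scalar, in which each call advances one stage of the MPI_ISEND / MPI_IRECV / MPI_TEST exchange and returns without blocking):

   ! hypothetical outline, not code from this file
   do ispec_interior = 1, n_interior_elements
     ! ... element computation ...
     if (mod(ispec_interior, ELEMENTS_BETWEEN_NONBLOCKING) == 0) &
       call assemble_MPI_scalar(..., iphase)   ! poll the pending messages with MPI_TEST
   enddo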
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          iglob = ibool_outer_core(i,j,k,ispec)
+
+          tempx1l = 0._CUSTOM_REAL
+          tempx2l = 0._CUSTOM_REAL
+          tempx3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            tempx1l = tempx1l + displ_outer_core(ibool_outer_core(l,j,k,ispec)) * hprime_xx(i,l)
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            tempx2l = tempx2l + displ_outer_core(ibool_outer_core(i,l,k,ispec)) * hprime_yy(j,l)
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            tempx3l = tempx3l + displ_outer_core(ibool_outer_core(i,j,l,ispec)) * hprime_zz(k,l)
+          enddo
+
+!         get derivatives of velocity potential with respect to x, y and z
+
+          xixl = xix_outer_core(i,j,k,ispec)
+          xiyl = xiy_outer_core(i,j,k,ispec)
+          xizl = xiz_outer_core(i,j,k,ispec)
+          etaxl = etax_outer_core(i,j,k,ispec)
+          etayl = etay_outer_core(i,j,k,ispec)
+          etazl = etaz_outer_core(i,j,k,ispec)
+          gammaxl = gammax_outer_core(i,j,k,ispec)
+          gammayl = gammay_outer_core(i,j,k,ispec)
+          gammazl = gammaz_outer_core(i,j,k,ispec)
+
+! compute the jacobian
+          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                        + xizl*(etaxl*gammayl-etayl*gammaxl))
+
+          dpotentialdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
+          dpotentialdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
+          dpotentialdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
+
+! add (chi/rho)grad(rho) term in no gravity case
+
+! With regard to the non-gravitating case: we cannot set N^2 = 0 *and* let g = 0.
+! We can *either* assume N^2 = 0 but keep gravity g, *or* we can assume that gravity
+! is negligible to begin with, as in our GJI 2002a, in which case N does not arise.
+! We get:
+!
+! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
+!
+! Then the displacement is
+!
+! \bu = \bdel\chi+\chi\bdel\ln\rho = \rho^{-1}\bdel(\rho\chi)
+!
+! and the pressure is
+!
+! p = -\rho\ddot{\chi}
+!
+! Thus in our 2002b GJI paper eqn (21) is wrong, and equation (41)
+! in our AGU monograph is incorrect; these equations should be replaced by
+!
+! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
+!
+! Note that the fluid potential we use in GJI 2002a differs from the one used here:
+!
+! \chi_GJI2002a = \rho\partial_t\chi
+!
+! such that
+!
+! \bv = \partial_t\bu=\rho^{-1}\bdel\chi_GJI2002a  (GJI 2002a eqn 20)
+!
+! p = - \partial_t\chi_GJI2002a (GJI 2002a eqn 19)
+
+! use mesh coordinates to get theta and phi
+! x y z contain r theta phi
+
+      radius = dble(xstore_outer_core(iglob))
+      theta = dble(ystore_outer_core(iglob))
+      phi = dble(zstore_outer_core(iglob))
+
+      cos_theta = dcos(theta)
+      sin_theta = dsin(theta)
+      cos_phi = dcos(phi)
+      sin_phi = dsin(phi)
+
+      int_radius = nint(radius * R_EARTH_KM * 10.d0)
+
+! grad(rho)/rho in Cartesian components
+      grad_x_ln_rho = sin_theta * cos_phi * d_ln_density_dr_table(int_radius)
+      grad_y_ln_rho = sin_theta * sin_phi * d_ln_density_dr_table(int_radius)
+      grad_z_ln_rho = cos_theta * d_ln_density_dr_table(int_radius)
+
+! adding (chi/rho)grad(rho)
+      dpotentialdxl = dpotentialdxl + displ_outer_core(iglob) * grad_x_ln_rho
+      dpotentialdyl = dpotentialdyl + displ_outer_core(iglob) * grad_y_ln_rho
+      dpotentialdzl = dpotentialdzl + displ_outer_core(iglob) * grad_z_ln_rho
+
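The term just added is \chi\,\bdel\ln\rho in Cartesian components, with the radial derivative read from d_ln_density_dr_table at index nint(r * R_EARTH_KM * 10), i.e. a table sampled every 0.1 km in radius:

  \bdel\ln\rho \;=\; \frac{d\ln\rho}{dr}\,\hat r, \qquad
  \hat r = (\sin\theta\cos\phi,\ \sin\theta\sin\phi,\ \cos\theta).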
+          tempx1(i,j,k) = jacobianl*(xixl*dpotentialdxl + xiyl*dpotentialdyl + xizl*dpotentialdzl)
+          tempx2(i,j,k) = jacobianl*(etaxl*dpotentialdxl + etayl*dpotentialdyl + etazl*dpotentialdzl)
+          tempx3(i,j,k) = jacobianl*(gammaxl*dpotentialdxl + gammayl*dpotentialdyl + gammazl*dpotentialdzl)
+
+          enddo
+        enddo
+      enddo
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          tempx1l = 0._CUSTOM_REAL
+          tempx2l = 0._CUSTOM_REAL
+          tempx3l = 0._CUSTOM_REAL
+
+          do l=1,NGLLX
+            tempx1l = tempx1l + tempx1(l,j,k) * hprimewgll_xx(l,i)
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
+            tempx2l = tempx2l + tempx2(i,l,k) * hprimewgll_yy(l,j)
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
+
+!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
+            tempx3l = tempx3l + tempx3(i,j,l) * hprimewgll_zz(l,k)
+          enddo
+
+! sum contributions from each element to the global mesh and add gravity term
+
+          iglob = ibool_outer_core(i,j,k,ispec)
+          accel_outer_core(iglob) = accel_outer_core(iglob) - &
+            (wgllwgll_yz(j,k)*tempx1l + wgllwgll_xz(i,k)*tempx2l + wgllwgll_xy(i,j)*tempx3l)
+
+        enddo
+      enddo
+    enddo
+
+  enddo   ! spectral element loop
+
+  end subroutine compute_forces_OC
+

Deleted: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.f90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.f90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -1,234 +0,0 @@
-!=====================================================================
-!
-!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
-!          --------------------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Seismological Laboratory, California Institute of Technology, USA
-!             and University of Pau / CNRS / INRIA, France
-! (c) California Institute of Technology and University of Pau / CNRS / INRIA
-!                            August 2008
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-! compute forces in the outer core (i.e., the fluid region)
-  subroutine compute_forces_OC(d_ln_density_dr_table, &
-          displfluid,accelfluid,xstore,ystore,zstore, &
-          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,ibool,icall,is_on_a_slice_edge_outer_core)
-
-  implicit none
-
-  include "constants.h"
-
-! include values created by the mesher
-! done for performance only using static allocation to allow for loop unrolling
-  include "values_from_mesher.h"
-
-  logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
-
-  integer :: icall
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(nglob_outer_core) :: displfluid,accelfluid
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec_outer_core) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec_outer_core) :: xix,xiy,xiz, &
-                      etax,etay,etaz,gammax,gammay,gammaz
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
-
-! for gravity
-  integer int_radius
-  double precision radius,theta,phi
-  double precision cos_theta,sin_theta,cos_phi,sin_phi
-  double precision, dimension(NRAD_GRAVITY) :: d_ln_density_dr_table
-  real(kind=CUSTOM_REAL), dimension(nglob_outer_core) :: xstore,ystore,zstore
-
-  integer ispec,iglob
-  integer i,j,k,l
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) dpotentialdxl,dpotentialdyl,dpotentialdzl
-  real(kind=CUSTOM_REAL) tempx1l,tempx2l,tempx3l
-
-  double precision grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho
-
-! ****************************************************
-!   big loop over all spectral elements in the fluid
-! ****************************************************
-
-! set acceleration to zero
-  if(icall == 1) accelfluid(:) = 0._CUSTOM_REAL
-
-  do ispec = 1,NSPEC_OUTER_CORE
-
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_outer_core(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_outer_core(ispec))) cycle
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          iglob = ibool(i,j,k,ispec)
-
-          tempx1l = 0._CUSTOM_REAL
-          tempx2l = 0._CUSTOM_REAL
-          tempx3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            tempx1l = tempx1l + displfluid(ibool(l,j,k,ispec)) * hprime_xx(i,l)
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            tempx2l = tempx2l + displfluid(ibool(i,l,k,ispec)) * hprime_yy(j,l)
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            tempx3l = tempx3l + displfluid(ibool(i,j,l,ispec)) * hprime_zz(k,l)
-          enddo
-
-!         get derivatives of velocity potential with respect to x, y and z
-
-          xixl = xix(i,j,k,ispec)
-          xiyl = xiy(i,j,k,ispec)
-          xizl = xiz(i,j,k,ispec)
-          etaxl = etax(i,j,k,ispec)
-          etayl = etay(i,j,k,ispec)
-          etazl = etaz(i,j,k,ispec)
-          gammaxl = gammax(i,j,k,ispec)
-          gammayl = gammay(i,j,k,ispec)
-          gammazl = gammaz(i,j,k,ispec)
-
-! compute the jacobian
-          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                        + xizl*(etaxl*gammayl-etayl*gammaxl))
-
-          dpotentialdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l
-          dpotentialdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l
-          dpotentialdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
-
-! add (chi/rho)grad(rho) term in no gravity case
-
-! With regards to the non-gravitating case: we cannot set N^2 = 0 *and* let g = 0.
-! We can *either* assume N^2 = 0 but keep gravity g, *or* we can assume that gravity
-! is negligible to begin with, as in our GJI 2002a, in which case N does not arise.
-! We get:
-!
-! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
-!
-! Then the displacement is
-!
-! \bu = \bdel\chi+\chi\bdel\ln\rho = \rho^{-1}\bdel(\rho\chi)
-!
-! and the pressure is
-!
-! p = -\rho\ddot{\chi}
-!
-! Thus in our 2002b GJI paper eqn (21) is wrong, and equation (41)
-! in our AGU monograph is incorrect; these equations should be replaced by
-!
-! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
-!
-! Note that the fluid potential we use in GJI 2002a differs from the one used here:
-!
-! \chi_GJI2002a = \rho\partial\t\chi
-!
-! such that
-!
-! \bv = \partial_t\bu=\rho^{-1}\bdel\chi_GJI2002a  (GJI 2002a eqn 20)
-!
-! p = - \partial_t\chi_GJI2002a (GJI 2002a eqn 19)
-
-! use mesh coordinates to get theta and phi
-! x y z contain r theta phi
-
-      radius = dble(xstore(iglob))
-      theta = dble(ystore(iglob))
-      phi = dble(zstore(iglob))
-
-      cos_theta = dcos(theta)
-      sin_theta = dsin(theta)
-      cos_phi = dcos(phi)
-      sin_phi = dsin(phi)
-
-      int_radius = nint(radius * R_EARTH_KM * 10.d0)
-
-! grad(rho)/rho in Cartesian components
-      grad_x_ln_rho = sin_theta * cos_phi * d_ln_density_dr_table(int_radius)
-      grad_y_ln_rho = sin_theta * sin_phi * d_ln_density_dr_table(int_radius)
-      grad_z_ln_rho = cos_theta * d_ln_density_dr_table(int_radius)
-
-! adding (chi/rho)grad(rho)
-      dpotentialdxl = dpotentialdxl + displfluid(iglob) * grad_x_ln_rho
-      dpotentialdyl = dpotentialdyl + displfluid(iglob) * grad_y_ln_rho
-      dpotentialdzl = dpotentialdzl + displfluid(iglob) * grad_z_ln_rho
-
-          tempx1(i,j,k) = jacobianl*(xixl*dpotentialdxl + xiyl*dpotentialdyl + xizl*dpotentialdzl)
-          tempx2(i,j,k) = jacobianl*(etaxl*dpotentialdxl + etayl*dpotentialdyl + etazl*dpotentialdzl)
-          tempx3(i,j,k) = jacobianl*(gammaxl*dpotentialdxl + gammayl*dpotentialdyl + gammazl*dpotentialdzl)
-
-          enddo
-        enddo
-      enddo
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          tempx1l = 0._CUSTOM_REAL
-          tempx2l = 0._CUSTOM_REAL
-          tempx3l = 0._CUSTOM_REAL
-
-          do l=1,NGLLX
-            tempx1l = tempx1l + tempx1(l,j,k) * hprimewgll_xx(l,i)
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
-            tempx2l = tempx2l + tempx2(i,l,k) * hprimewgll_yy(l,j)
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
-
-!!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
-            tempx3l = tempx3l + tempx3(i,j,l) * hprimewgll_zz(l,k)
-          enddo
-
-! sum contributions from each element to the global mesh and add gravity term
-
-          iglob = ibool(i,j,k,ispec)
-          accelfluid(iglob) = accelfluid(iglob) - (wgllwgll_yz(j,k)*tempx1l + wgllwgll_xz(i,k)*tempx2l + wgllwgll_xy(i,j)*tempx3l)
-
-        enddo
-      enddo
-    enddo
-
-  enddo   ! spectral element loop
-
-  end subroutine compute_forces_OC
-
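The icall test near the top of the element loop above is the heart of the overlap scheme: pass 1 (icall == 1) computes only the elements flagged as lying on a slice edge, pass 2 (icall == 2) only the interior ones, so that the assembly of the edge contributions can be in flight while the interior is still being computed. A minimal, self-contained sketch of that two-pass loop (toy array size, a print statement standing in for the real element work):

  program two_pass_sketch

  implicit none

  integer, parameter :: NSPEC = 8

  logical, dimension(NSPEC) :: is_on_a_slice_edge
  integer :: icall,ispec

! mark a few elements as lying on a communication edge (placeholder pattern)
  is_on_a_slice_edge(:) = .false.
  is_on_a_slice_edge(1) = .true.
  is_on_a_slice_edge(NSPEC) = .true.

  do icall = 1,2
    do ispec = 1,NSPEC
! same test as above: skip interior elements in pass 1 and edge elements in pass 2
      if((icall == 2 .and. is_on_a_slice_edge(ispec)) .or. &
         (icall == 1 .and. .not. is_on_a_slice_edge(ispec))) cycle
      print *,'pass ',icall,' computes element ',ispec
    enddo
! in the real solver the non-blocking assembly of the edge contributions
! is started between the two passes
  enddo

  end program two_pass_sketch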

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_chunk_buffers.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_chunk_buffers.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_chunk_buffers.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -140,7 +140,7 @@
   integer ichunk_receive,iproc_xi_receive,iproc_eta_receive
   integer iproc_loop,iproc_xi_loop,iproc_eta_loop
   integer iproc_xi_loop_inv,iproc_eta_loop_inv
-  integer imember_corner,npoin2D_value_received
+  integer imember_corner
 
   integer iregion_code
 
@@ -160,6 +160,7 @@
   integer :: sender,receiver
   integer, dimension(MPI_STATUS_SIZE) :: msg_status
   integer :: ier
+  integer :: npoin2D_value_received
 #endif
 
 ! current message number

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_regions_mesh.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_regions_mesh.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_regions_mesh.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -725,6 +725,7 @@
   iboun(:,:) = .false.
   iMPIcut_xi(:,:) = .false.
   iMPIcut_eta(:,:) = .false.
+  is_on_a_slice_edge(:) = .false.
 
 !! DK DK added this for merged version
 ! creating mass matrix in this slice (will be fully assembled in the solver)
@@ -839,9 +840,6 @@
       iboun(5,ispec)= .true.
   endif
 
-  is_on_a_slice_edge(ispec) = iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
-      iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec) .or. iboun(5,ispec) .or. iboun(6,ispec)
-
 ! define the doubling flag of this element
      if(iregion_code /= IREGION_OUTER_CORE) idoubling(ispec) = doubling_index(ilayer)
 
@@ -1026,9 +1024,6 @@
   if (ilayer==ifirst_layer) iboun(6,ispec)= iboun_sb(ispec_superbrick,6)
   if (ilayer==ilast_layer .and. iz_elem==1) iboun(5,ispec)= iboun_sb(ispec_superbrick,5)
 
-  is_on_a_slice_edge(ispec) = iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
-      iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec) .or. iboun(5,ispec) .or. iboun(6,ispec)
-
 ! define the doubling flag of this element
      if(iregion_code /= IREGION_OUTER_CORE) idoubling(ispec) = doubling_index(ilayer)
 
@@ -1170,9 +1165,6 @@
       if (iproc_eta == NPROC_ETA-1) iboun(4,ispec)= .true.
   endif
 
-  is_on_a_slice_edge(ispec) = iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
-      iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec) .or. iboun(5,ispec) .or. iboun(6,ispec)
-
 ! define the doubling flag of this element
 ! only two active central cubes, the four others are fictitious
 
@@ -1239,6 +1231,20 @@
 ! check total number of spectral elements created
   if(ispec /= nspec) call exit_MPI(myrank,'ispec should equal nspec')
 
+! if any of these flags is true, the element is on a communication edge
+! this is not enough because it can also be in contact by an edge or a corner but not a full face
+! therefore we will have to fix array "is_on_a_slice_edge" later to take this into account
+  is_on_a_slice_edge(:) = &
+      iMPIcut_xi(1,:) .or. iMPIcut_xi(2,:) .or. &
+      iMPIcut_eta(1,:) .or. iMPIcut_eta(2,:) .or. &
+      iboun(1,:) .or. iboun(2,:) .or. &
+      iboun(3,:) .or. iboun(4,:) .or. &
+      iboun(5,:) .or. iboun(6,:)
+
+! no need to count fictitious elements on the edges
+! for which communications cannot be overlapped with calculations
+  where(idoubling == IFLAG_IN_FICTITIOUS_CUBE) is_on_a_slice_edge = .false.
+
 ! only create global addressing and the MPI buffers in the first pass
   if(ipass == 1) then
 

Added: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/debug_with_opendx.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/debug_with_opendx.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/debug_with_opendx.f90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -0,0 +1,273 @@
+
+  subroutine debug_with_opendx(is_on_a_slice_edge_crust_mantle,xstore_crust_mantle, &
+                              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle)
+
+  implicit none
+
+  include "constants.h"
+
+!! DK DK for the merged version
+! include values created by the mesher
+  include "values_from_mesher.h"
+
+  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
+
+! mesh parameters
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: &
+        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle
+
+  integer :: ntotspecAVS_DX,ioffset,ntotpoinAVS_DX,i,j,k,ispec,iglob
+  real :: xval,yval,zval
+
+!! DK DK create an OpenDX file showing all the elements computed in the first step
+!! DK DK of non blocking overlapping
+
+ ntotspecAVS_DX = count(is_on_a_slice_edge_crust_mantle(:))
+ ntotpoinAVS_DX = 8 * ntotspecAVS_DX
+
+    open(unit=11,file='DX_fullmesh.dx',status='unknown')
+    write(11,*) 'object 1 class array type float rank 1 shape 3 items ',ntotpoinAVS_DX,' data follows'
+  do ispec=1,nspec_crust_mantle
+if(is_on_a_slice_edge_crust_mantle(ispec)) then
+
+i = 1
+j = 1
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = 1
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = NGLLX
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = 1
+j = NGLLX
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = 1
+j = 1
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = 1
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = NGLLX
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = 1
+j = NGLLX
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+endif
+  enddo
+
+! ************* generate elements ******************
+
+  write(11,*) 'object 2 class array type int rank 1 shape 8 items ',ntotspecAVS_DX,' data follows'
+
+! read local elements in this slice and output global AVS or DX elements
+ioffset = 0
+  do ispec=1,nspec_crust_mantle
+if(is_on_a_slice_edge_crust_mantle(ispec)) then
+! point order in OpenDX is 4,1,8,5,3,2,7,6, *not* 1,2,3,4,5,6,7,8 as in AVS
+! in the case of OpenDX, node numbers start at zero
+write(11,"(i6,1x,i6,1x,i6,1x,i6,1x,i6,1x,i6,1x,i6,1x,i6)") &
+            ioffset+3,ioffset+0,ioffset+7,ioffset+4,ioffset+2,ioffset+1,ioffset+6,ioffset+5
+ioffset = ioffset + 8
+endif
+  enddo
+
+
+! ************* generate element data values ******************
+
+! output AVS or DX header for data
+    write(11,*) 'attribute "element type" string "cubes"'
+    write(11,*) 'attribute "ref" string "positions"'
+    write(11,*) 'object 3 class array type float rank 0 items ',ntotspecAVS_DX,' data follows'
+
+! read local elements in this slice and output global AVS or DX elements
+  do ispec=1,nspec_crust_mantle
+if(is_on_a_slice_edge_crust_mantle(ispec)) then
+        write(11,*) '100'
+endif
+  enddo
+
+! define OpenDX field
+    write(11,*) 'attribute "dep" string "connections"'
+    write(11,*) 'object "irregular positions irregular connections" class field'
+    write(11,*) 'component "positions" value 1'
+    write(11,*) 'component "connections" value 2'
+    write(11,*) 'component "data" value 3'
+    write(11,*) 'end'
+
+  close(11)
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+!! DK DK create an OpenDX file showing all the elements that seem to have a problem
+!! DK DK in the second step of non blocking overlapping
+
+!! DK DK suppressed for now
+ goto 777
+
+ ntotspecAVS_DX = 2
+ ntotpoinAVS_DX = 8 * ntotspecAVS_DX
+
+    open(unit=11,file='DX_fullmesh2.dx',status='unknown')
+    write(11,*) 'object 1 class array type float rank 1 shape 3 items ',ntotpoinAVS_DX,' data follows'
+  do ispec=1,nspec_crust_mantle
+if(ispec == 201 .or. ispec == 203) then
+
+i = 1
+j = 1
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = 1
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = NGLLX
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = 1
+j = NGLLX
+k = 1
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = 1
+j = 1
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = 1
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = NGLLX
+j = NGLLX
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+i = 1
+j = NGLLX
+k = NGLLZ
+iglob = ibool_crust_mantle(i,j,k,ispec)
+call rthetaphi_2_xyz(xval,yval,zval,xstore_crust_mantle(iglob), &
+    ystore_crust_mantle(iglob),zstore_crust_mantle(iglob))
+        write(11,"(f10.7,1x,f10.7,1x,f10.7)") xval,yval,zval
+
+endif
+  enddo
+
+! ************* generate elements ******************
+
+  write(11,*) 'object 2 class array type int rank 1 shape 8 items ',ntotspecAVS_DX,' data follows'
+
+! read local elements in this slice and output global AVS or DX elements
+ioffset = 0
+  do ispec=1,nspec_crust_mantle
+if(ispec == 201 .or. ispec == 203) then
+! point order in OpenDX is 4,1,8,5,3,2,7,6, *not* 1,2,3,4,5,6,7,8 as in AVS
+! in the case of OpenDX, node numbers start at zero
+write(11,"(i6,1x,i6,1x,i6,1x,i6,1x,i6,1x,i6,1x,i6,1x,i6)") &
+            ioffset+3,ioffset+0,ioffset+7,ioffset+4,ioffset+2,ioffset+1,ioffset+6,ioffset+5
+ioffset = ioffset + 8
+endif
+  enddo
+
+
+! ************* generate element data values ******************
+
+! output AVS or DX header for data
+    write(11,*) 'attribute "element type" string "cubes"'
+    write(11,*) 'attribute "ref" string "positions"'
+    write(11,*) 'object 3 class array type float rank 0 items ',ntotspecAVS_DX,' data follows'
+
+! read local elements in this slice and output global AVS or DX elements
+  do ispec=1,nspec_crust_mantle
+if(ispec == 201 .or. ispec == 203) then
+        write(11,*) '200'
+endif
+  enddo
+
+! define OpenDX field
+    write(11,*) 'attribute "dep" string "connections"'
+    write(11,*) 'object "irregular positions irregular connections" class field'
+    write(11,*) 'component "positions" value 1'
+    write(11,*) 'component "connections" value 2'
+    write(11,*) 'component "data" value 3'
+    write(11,*) 'end'
+
+  close(11)
+
+  777 continue
+
+  end subroutine debug_with_opendx
+
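For reference, the .dx files written by this routine follow the usual OpenDX layout of three objects (positions, hexahedral connections in the order 4,1,8,5,3,2,7,6 with zero-based point numbers, then one data value per element) gathered into a field. A minimal sketch that writes a single unit cube in that layout, with placeholder coordinates and data value:

  program opendx_one_cube_sketch

  implicit none

  integer :: i

! corner coordinates of a unit cube, listed in the 1..8 order used above
  real, dimension(3,8) :: corners

  corners(:,1) = (/0.,0.,0./)
  corners(:,2) = (/1.,0.,0./)
  corners(:,3) = (/1.,1.,0./)
  corners(:,4) = (/0.,1.,0./)
  corners(:,5) = (/0.,0.,1./)
  corners(:,6) = (/1.,0.,1./)
  corners(:,7) = (/1.,1.,1./)
  corners(:,8) = (/0.,1.,1./)

  open(unit=11,file='DX_one_cube.dx',status='unknown')

  write(11,*) 'object 1 class array type float rank 1 shape 3 items ',8,' data follows'
  do i = 1,8
    write(11,"(f10.7,1x,f10.7,1x,f10.7)") corners(1,i),corners(2,i),corners(3,i)
  enddo

! OpenDX numbers points from zero and expects the order 4,1,8,5,3,2,7,6
  write(11,*) 'object 2 class array type int rank 1 shape 8 items ',1,' data follows'
  write(11,"(8(i6,1x))") 3,0,7,4,2,1,6,5
  write(11,*) 'attribute "element type" string "cubes"'
  write(11,*) 'attribute "ref" string "positions"'

! one scalar value per element (placeholder)
  write(11,*) 'object 3 class array type float rank 0 items ',1,' data follows'
  write(11,*) '100'

! define the OpenDX field that ties the three objects together
  write(11,*) 'attribute "dep" string "connections"'
  write(11,*) 'object "irregular positions irregular connections" class field'
  write(11,*) 'component "positions" value 1'
  write(11,*) 'component "connections" value 2'
  write(11,*) 'component "data" value 3'
  write(11,*) 'end'

  close(11)

  end program opendx_one_cube_sketch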

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/exit_mpi.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/exit_mpi.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/exit_mpi.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -57,7 +57,7 @@
 
 ! write error message to file
   call get_value_string(OUTPUT_FILES, 'OUTPUT_FILES', 'OUTPUT_FILES')
-  write(outputname,"('/error_message',i6.6,'.txt')") myrank
+  write(outputname,"('/error_message_from_rank_',i6.6,'.txt')") myrank
   open(unit=IERROR,file=trim(OUTPUT_FILES)//outputname,status='unknown',action='write')
   write(IERROR,*) error_msg(1:len(error_msg))
   write(IERROR,*) 'Error detected, aborting MPI... proc ',myrank
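The new file name is built with an internal write; the i6.6 edit descriptor zero-pads the rank so that every process gets a distinct, fixed-width name. A minimal standalone sketch (the rank value is a placeholder):

  program rank_filename_sketch

  implicit none

  integer :: myrank
  character(len=150) :: outputname

  myrank = 42
  write(outputname,"('/error_message_from_rank_',i6.6,'.txt')") myrank

! prints /error_message_from_rank_000042.txt
  print *,trim(outputname)

  end program rank_filename_sketch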

Added: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -0,0 +1,89 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory, California Institute of Technology, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) California Institute of Technology and University of Pau / CNRS / INRIA
+!                            August 2008
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+!! DK DK fix the non-blocking arrays to assemble inside the chunks: elements
+!! DK DK in contact with the MPI faces by an edge or a corner only but not
+!! DK DK a full face are missing, therefore let us add them
+  subroutine fix_non_blocking_arrays(is_on_a_slice_edge,iboolright_xi, &
+         iboolleft_xi,iboolright_eta,iboolleft_eta, &
+         npoin2D_xi,npoin2D_eta,ibool, &
+         mask_ibool,nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: npoin2D_xi,npoin2D_eta,nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
+
+  logical, dimension(nspec) :: is_on_a_slice_edge
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX) :: iboolleft_xi,iboolright_xi
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX) :: iboolleft_eta,iboolright_eta
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+! this mask is declared as integer in the calling program because it is used elsewhere
+! to store integers, and it is reused here as a logical to save memory
+  logical, dimension(nglob) :: mask_ibool
+
+  integer :: ipoin,ispec,i,j,k
+
+! clean the mask
+  mask_ibool(:) = .false.
+
+! mark all the points that are in the MPI buffers to assemble inside each chunk
+  do ipoin = 1,npoin2D_xi
+    mask_ibool(iboolright_xi(ipoin)) = .true.
+    mask_ibool(iboolleft_xi(ipoin)) = .true.
+  enddo
+
+  do ipoin = 1,npoin2D_eta
+    mask_ibool(iboolright_eta(ipoin)) = .true.
+    mask_ibool(iboolleft_eta(ipoin)) = .true.
+  enddo
+
+! now label all the elements that have at least one corner belonging
+! to any of these buffers as elements that must contribute to the
+! first step of the calculations (performed on the edges before starting
+! the non-blocking communications); there is no need to examine the inside
+! of the elements, checking their eight corners is sufficient
+  do ispec = 1,nspec
+    do k = 1,NGLLZ,NGLLZ-1
+      do j  = 1,NGLLY,NGLLY-1
+        do i = 1,NGLLX,NGLLX-1
+          if(mask_ibool(ibool(i,j,k,ispec))) then
+            is_on_a_slice_edge(ispec) = .true.
+            goto 888
+          endif
+        enddo
+      enddo
+    enddo
+  888 continue
+  enddo
+
+  end subroutine fix_non_blocking_arrays
+
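Note that the stride NGLLX-1 in the triple loop above is what restricts the scan to the eight corners of each element: a loop written do i = 1,NGLLX,NGLLX-1 visits only i = 1 and i = NGLLX. A tiny standalone sketch (NGLLX = NGLLY = NGLLZ = 5 as a placeholder) that simply counts the points visited:

  program corner_scan_sketch

  implicit none

  integer, parameter :: NGLLX = 5, NGLLY = 5, NGLLZ = 5
  integer :: i,j,k,ncorners

  ncorners = 0
  do k = 1,NGLLZ,NGLLZ-1
    do j = 1,NGLLY,NGLLY-1
      do i = 1,NGLLX,NGLLX-1
        ncorners = ncorners + 1
      enddo
    enddo
  enddo

! prints 8, one count per element corner
  print *,'number of points visited = ',ncorners

  end program corner_scan_sketch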

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -419,7 +419,7 @@
 
   equivalence(mask_ibool,     displ_crust_mantle)
 
-! because NSPEC_OUTER_CORE is always an even number, we can put
+! because NSPEC_CRUST_MANTLE is always an even number, we can put
 ! two single-precision arrays in each double-precision array.
 ! this does *NOT* work if double precision is turned on because
 ! the size of an integer does not correspond to the size of a double.
@@ -526,6 +526,26 @@
   npoin2D_max_all = max(maxval(npoin2D_xi_crust_mantle(:) + npoin2D_xi_inner_core(:)), &
                         maxval(npoin2D_eta_crust_mantle(:) + npoin2D_eta_inner_core(:)))
 
+!! DK DK fix the non-blocking arrays to assemble inside the chunks: elements
+!! DK DK in contact with the MPI faces by an edge or a corner only but not
+!! DK DK a full face are missing, therefore let us add them
+#ifdef USE_MPI
+  call fix_non_blocking_arrays(is_on_a_slice_edge_crust_mantle,iboolright_xi_crust_mantle, &
+         iboolleft_xi_crust_mantle,iboolright_eta_crust_mantle,iboolleft_eta_crust_mantle, &
+         npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1),ibool_crust_mantle, &
+         mask_ibool,NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB2DMAX_XMIN_XMAX_CM,NGLOB2DMAX_YMIN_YMAX_CM)
+
+  call fix_non_blocking_arrays(is_on_a_slice_edge_outer_core,iboolright_xi_outer_core, &
+         iboolleft_xi_outer_core,iboolright_eta_outer_core,iboolleft_eta_outer_core, &
+         npoin2D_xi_outer_core(1),npoin2D_eta_outer_core(1),ibool_outer_core, &
+         mask_ibool,NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC)
+
+  call fix_non_blocking_arrays(is_on_a_slice_edge_inner_core,iboolright_xi_inner_core, &
+         iboolleft_xi_inner_core,iboolright_eta_inner_core,iboolleft_eta_inner_core, &
+         npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1),ibool_inner_core, &
+         mask_ibool,NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB2DMAX_XMIN_XMAX_IC,NGLOB2DMAX_YMIN_YMAX_IC)
+#endif
+
 !! DK DK for the merged version, solver inserted here
   call specfem3D(myrank,sizeprocs,ichunk_slice,iproc_xi_slice,iproc_eta_slice,NSOURCES, &
   NTSTEP_BETWEEN_OUTPUT_SEISMOS,ibool_crust_mantle,ibool_outer_core,ibool_inner_core, &
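The mask passed to fix_non_blocking_arrays above is not a separate allocation: as the equivalence statement and the comment on NSPEC_CRUST_MANTLE earlier in this file show, scratch arrays are overlaid on large solver arrays to save memory. A minimal sketch of that storage-reuse idea, with placeholder sizes and default-kind types so that one array element maps onto one numeric storage unit:

  program equivalence_sketch

  implicit none

  integer, parameter :: NGLOB = 10

  real, dimension(NGLOB) :: work_array
  logical, dimension(NGLOB) :: mask
  equivalence(mask,work_array)

! use the memory as a logical mask first
  mask(:) = .false.
  mask(3) = .true.
  print *,'points marked in the mask = ',count(mask)

! once the mask is no longer needed, the same memory is reused as a real array
  work_array(:) = 0.
  print *,'work array reset, first value = ',work_array(1)

  end program equivalence_sketch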

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/read_compute_parameters.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/read_compute_parameters.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/read_compute_parameters.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -219,9 +219,9 @@
     NPROC_ETA = NPROC_XI
   endif
 
-! support for only one slice per chunk has been discontinued when there is more than one chunk
+! support for only one slice per chunk has been discontinued
 ! because it induces topological problems, and we are not interested in using small meshes
-  if(NCHUNKS > 1 .and. (NPROC_XI == 1 .or. NPROC_ETA == 1)) stop 'support for only one slice per chunk has been discontinued'
+  if(NPROC_XI == 1 .or. NPROC_ETA == 1) stop 'support for only one slice per chunk edge has been discontinued'
 
 ! define the velocity model
   call read_value_string(MODEL, 'model.name')

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90	2008-09-17 17:33:29 UTC (rev 12909)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90	2008-09-17 18:17:25 UTC (rev 12910)
@@ -78,7 +78,7 @@
   include "values_from_mesher.h"
 
 ! for non blocking communications
-  integer :: icall
+  integer :: icall,iphase
   real :: percentage_edge
   logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
   logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
@@ -160,6 +160,10 @@
   real(kind=CUSTOM_REAL) additional_term,force_normal_comp
 #endif
 
+! variable lengths for factor_common and one_minus_sum_beta
+  integer :: vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle
+  integer :: vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core
+
 ! arrays to couple with the fluid regions by pointwise matching
   integer, dimension(NSPEC2D_BOTTOM_OC) :: ibelm_bottom_outer_core
   integer, dimension(NSPEC2D_TOP_OC) :: ibelm_top_outer_core
@@ -192,6 +196,8 @@
 
 #ifdef USE_MPI
 
+  integer :: ipoin
+
 ! communication pattern for faces between chunks
   integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
 
@@ -443,7 +449,7 @@
   integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
 #endif
 
-  integer ichunk,iproc_xi,iproc_eta
+  integer :: ichunk,iproc_xi,iproc_eta
   integer NPROC_ONE_DIRECTION
 
 ! maximum of the norm of the displacement and of the potential in the fluid
@@ -1650,14 +1656,34 @@
   call get_event_info_parallel(myrank,yr_SAC,jda_SAC,ho_SAC,mi_SAC,sec_SAC,t_cmt_SAC, &
                  elat_SAC,elon_SAC,depth_SAC,mb_SAC,ename_SAC,cmt_lat_SAC,cmt_lon_SAC,cmt_depth_SAC,cmt_hdur_SAC,NSOURCES_SAC)
 
+  if(.not. USE_NONBLOCKING_COMMS) then
+    is_on_a_slice_edge_crust_mantle(:) = .true.
+    is_on_a_slice_edge_outer_core(:) = .true.
+    is_on_a_slice_edge_inner_core(:) = .true.
+  endif
+
+  vx_crust_mantle = size(factor_common_crust_mantle,2)
+  vy_crust_mantle = size(factor_common_crust_mantle,3)
+  vz_crust_mantle = size(factor_common_crust_mantle,4)
+  vnspec_crust_mantle = size(factor_common_crust_mantle,5)
+
+  vx_inner_core = size(factor_common_inner_core,2)
+  vy_inner_core = size(factor_common_inner_core,3)
+  vz_inner_core = size(factor_common_inner_core,4)
+  vnspec_inner_core = size(factor_common_inner_core,5)
+
 ! define correct time steps if restart files
   if(NUMBER_OF_RUNS < 1 .or. NUMBER_OF_RUNS > 3) stop 'number of restart runs can be 1, 2 or 3'
   if(NUMBER_OF_THIS_RUN < 1 .or. NUMBER_OF_THIS_RUN > NUMBER_OF_RUNS) stop 'incorrect run number'
   if (SIMULATION_TYPE /= 1 .and. NUMBER_OF_RUNS /= 1) stop 'Only 1 run for SIMULATION_TYPE = 2/3'
 
-    it_begin = 1
-    it_end = NSTEP
+  it_begin = 1
+  it_end = NSTEP
 
+! initialize variables for writing seismograms
+  seismo_offset = it_begin - 1
+  seismo_current = 0
+
 !
 !   s t a r t   t i m e   i t e r a t i o n s
 !
@@ -1688,10 +1714,6 @@
   time_start = 0
 #endif
 
-! initialize variables for writing seismograms
-  seismo_offset = it_begin - 1
-  seismo_current = 0
-
 ! *********************************************************
 ! ************* MAIN LOOP OVER THE TIME STEPS *************
 ! *********************************************************
@@ -1973,14 +1995,28 @@
     time = (dble(it-1)*DT-t0)/scale_t
   endif
 
+  iphase = 0 ! do not start any non blocking communications at this stage
   icall = 1
   call compute_forces_OC(d_ln_density_dr_table, &
-         displ_outer_core,accel_outer_core,xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-         xix_outer_core,xiy_outer_core,xiz_outer_core, &
-         etax_outer_core,etay_outer_core,etaz_outer_core, &
-         gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-         hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-         wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,ibool_outer_core,icall,is_on_a_slice_edge_outer_core)
+          displ_outer_core,accel_outer_core,xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+          xix_outer_core,xiy_outer_core,xiz_outer_core, &
+          etax_outer_core,etay_outer_core,etaz_outer_core, &
+          gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
+#ifdef USE_MPI
+          is_on_a_slice_edge_outer_core, &
+          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+          iboolfaces_outer_core,iboolcorner_outer_core, &
+          iprocfrom_faces,iprocto_faces,imsg_type, &
+          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+          buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
+          NUM_MSG_TYPES,iphase, &
+#endif
+          hprime_xx,hprime_yy,hprime_zz, &
+          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,ibool_outer_core,icall)
 
 #ifndef USE_MPI
 !! DK DK put a fictitious source in each region in the case of a serial test if needed
@@ -2099,20 +2135,55 @@
 
   endif
 
-  icall = 2
-  call compute_forces_OC(d_ln_density_dr_table, &
-         displ_outer_core,accel_outer_core,xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-         xix_outer_core,xiy_outer_core,xiz_outer_core, &
-         etax_outer_core,etay_outer_core,etaz_outer_core, &
-         gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-         hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-         wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,ibool_outer_core,icall,is_on_a_slice_edge_outer_core)
+! outer core
+#ifdef USE_MPI
+  if(USE_NONBLOCKING_COMMS) then
+    iphase = 1 ! start the non blocking communications
+    call assemble_MPI_scalar(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+            iboolfaces_outer_core,iboolcorner_outer_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
+            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE),NGLOB2DMAX_XY_VAL_OC,NCHUNKS,iphase)
+  endif
+#endif
 
+#ifdef USE_MPI
+  if(USE_NONBLOCKING_COMMS) then
+    icall = 2
+    call compute_forces_OC(d_ln_density_dr_table, &
+          displ_outer_core,accel_outer_core,xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+          xix_outer_core,xiy_outer_core,xiz_outer_core, &
+          etax_outer_core,etay_outer_core,etaz_outer_core, &
+          gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
+          is_on_a_slice_edge_outer_core, &
+          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+          iboolfaces_outer_core,iboolcorner_outer_core, &
+          iprocfrom_faces,iprocto_faces,imsg_type, &
+          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+          buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
+          NUM_MSG_TYPES,iphase, &
+          hprime_xx,hprime_yy,hprime_zz, &
+          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,ibool_outer_core,icall)
+  endif
+#endif
+
 ! assemble all the contributions between slices using MPI
-
 ! outer core
 #ifdef USE_MPI
-  call assemble_MPI_scalar(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
+  if(USE_NONBLOCKING_COMMS) then
+    do while (iphase <= 5) ! make sure the last communications are finished and processed
+      call assemble_MPI_scalar(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
             npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
@@ -2123,7 +2194,22 @@
             buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
             NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
+            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE),NGLOB2DMAX_XY_VAL_OC,NCHUNKS,iphase)
+    enddo
+  else
+    call assemble_MPI_scalar_block(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+            iboolfaces_outer_core,iboolcorner_outer_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
             NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE),NGLOB2DMAX_XY_VAL_OC,NCHUNKS)
+  endif
 #endif
 
 ! multiply by the inverse of the mass matrix and update velocity
@@ -2140,6 +2226,7 @@
 
 ! for anisotropy and gravity, x y and z contain r theta and phi
 
+  iphase = 0 ! do not start any non blocking communications at this stage
   icall = 1
   call compute_forces_CM_IC(displ_crust_mantle,accel_crust_mantle, &
           xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
@@ -2148,24 +2235,33 @@
           kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle,muhstore_crust_mantle, &
           eta_anisostore_crust_mantle, &
           ibool_crust_mantle,idoubling_crust_mantle,R_memory_crust_mantle,epsilondev_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          factor_common_crust_mantle,size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5), &
-          is_on_a_slice_edge_crust_mantle, &
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+          factor_common_crust_mantle,vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle, &
           displ_inner_core,accel_inner_core, &
           xix_inner_core,xiy_inner_core,xiz_inner_core,etax_inner_core,etay_inner_core,etaz_inner_core, &
           gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
           kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
           R_memory_inner_core,epsilondev_inner_core,&
           one_minus_sum_beta_inner_core,factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5), &
-          is_on_a_slice_edge_inner_core, &
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+          vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core, &
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
           alphaval,betaval,gammaval, &
+#ifdef USE_MPI
+            is_on_a_slice_edge_crust_mantle,is_on_a_slice_edge_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUM_MSG_TYPES,iphase, &
+#endif
           COMPUTE_AND_STORE_STRAIN,AM_V,icall)
 
 #ifdef USE_MPI
@@ -2325,40 +2421,96 @@
 
     endif
 
-  icall = 2
-  call compute_forces_CM_IC(displ_crust_mantle,accel_crust_mantle, &
+! assemble all the contributions between slices using MPI
+! crust/mantle and inner core handled in the same call
+! in order to reduce the number of MPI messages by 2
+#ifdef USE_MPI
+  if(USE_NONBLOCKING_COMMS) then
+    iphase = 1 ! start the non blocking communications
+    call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1), &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
+            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS,iphase)
+  endif
+#endif
+
+  if(DEBUG_NONBLOCKING_COMMS) accel_crust_mantle = 1.e27
+
+#ifdef USE_MPI
+  if(USE_NONBLOCKING_COMMS) then
+    icall = 2
+    call compute_forces_CM_IC(displ_crust_mantle,accel_crust_mantle, &
           xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
           xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
           gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
           kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle,muhstore_crust_mantle, &
           eta_anisostore_crust_mantle, &
           ibool_crust_mantle,idoubling_crust_mantle,R_memory_crust_mantle,epsilondev_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          factor_common_crust_mantle,size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5), &
-          is_on_a_slice_edge_crust_mantle, &
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+          factor_common_crust_mantle,vx_crust_mantle,vy_crust_mantle,vz_crust_mantle,vnspec_crust_mantle, &
           displ_inner_core,accel_inner_core, &
           xix_inner_core,xiy_inner_core,xiz_inner_core,etax_inner_core,etay_inner_core,etaz_inner_core, &
           gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
           kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
           R_memory_inner_core,epsilondev_inner_core,&
           one_minus_sum_beta_inner_core,factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5), &
-          is_on_a_slice_edge_inner_core, &
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+          vx_inner_core,vy_inner_core,vz_inner_core,vnspec_inner_core, &
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
           alphaval,betaval,gammaval, &
+            is_on_a_slice_edge_crust_mantle,is_on_a_slice_edge_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUM_MSG_TYPES,iphase, &
           COMPUTE_AND_STORE_STRAIN,AM_V,icall)
+  endif
+#endif
 
+  if(DEBUG_USING_OPENDX .and. myrank == 0 .and. it == 2) &
+    call debug_with_opendx(is_on_a_slice_edge_crust_mantle,xstore_crust_mantle, &
+                              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle)
+
+#ifdef USE_MPI
+  if(DEBUG_NONBLOCKING_COMMS) then
+    do ipoin = 1,npoin2D_xi_crust_mantle(1)
+      if(minval(accel_crust_mantle(:,iboolright_xi_crust_mantle(ipoin))) < 1.e27) call exit_mpi(myrank,'error in new step 1')
+      if(minval(accel_crust_mantle(:,iboolleft_xi_crust_mantle(ipoin))) < 1.e27) call exit_mpi(myrank,'error in new step 2')
+      if(minval(accel_crust_mantle(:,iboolright_eta_crust_mantle(ipoin))) < 1.e27) then
+        print *,'myrank, minval, ibool = ',myrank, &
+          minval(accel_crust_mantle(:,iboolright_eta_crust_mantle(ipoin))),iboolright_eta_crust_mantle(ipoin)
+        call exit_mpi(myrank,'error in new step 3')
+      endif
+      if(minval(accel_crust_mantle(:,iboolleft_eta_crust_mantle(ipoin))) < 1.e27) call exit_mpi(myrank,'error in new step 4')
+    enddo
+    call exit_mpi(myrank,'everything went well in DEBUG_NONBLOCKING_COMMS, no overlap detected')
+  endif
+
 ! assemble all the contributions between slices using MPI
-
 ! crust/mantle and inner core handled in the same call
 ! in order to reduce the number of MPI messages by 2
-#ifdef USE_MPI
-  call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+  if(USE_NONBLOCKING_COMMS) then
+    do while (iphase <= 5) ! make sure the last communications are finished and processed
+      call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
             npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1), &
@@ -2372,7 +2524,25 @@
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
             NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
+            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS,iphase)
+    enddo
+  else
+    call assemble_MPI_vector_block(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1), &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
             NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS)
+  endif
 #endif
 
 !---
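With USE_NONBLOCKING_COMMS the assembly routines are now driven by the iphase counter: called once to post the messages, then called again from the solver (and from the do while loops above) until every message has been received and processed. A minimal, self-contained sketch of the kind of exchange such a routine can be built on -- not the actual internals of assemble_MPI_scalar or assemble_MPI_vector, just one MPI_IRECV/MPI_ISEND pair posted once and then polled with MPI_TEST; the buffer size is a placeholder and exactly two MPI ranks are assumed:

  program nonblocking_poll_sketch

  implicit none

  include 'mpif.h'

  integer, parameter :: N = 1000

  real, dimension(N) :: buffer_send,buffer_recv
  integer :: myrank,sizeprocs,other,ier
  integer :: request_send,request_recv
  logical :: flag_send,flag_recv

  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)
  call MPI_COMM_SIZE(MPI_COMM_WORLD,sizeprocs,ier)
  if(sizeprocs /= 2) call MPI_ABORT(MPI_COMM_WORLD,1,ier)

  other = 1 - myrank
  buffer_send(:) = real(myrank)

! first phase: post the non-blocking receive and send, then return to computing
  call MPI_IRECV(buffer_recv,N,MPI_REAL,other,0,MPI_COMM_WORLD,request_recv,ier)
  call MPI_ISEND(buffer_send,N,MPI_REAL,other,0,MPI_COMM_WORLD,request_send,ier)

! later phases: poll until both messages have completed; in the solver the
! element calculations for the interior of the slice happen in the meantime
  flag_send = .false.
  flag_recv = .false.
  do while (.not. (flag_send .and. flag_recv))
    if(.not. flag_recv) call MPI_TEST(request_recv,flag_recv,MPI_STATUS_IGNORE,ier)
    if(.not. flag_send) call MPI_TEST(request_send,flag_send,MPI_STATUS_IGNORE,ier)
  enddo

  print *,'rank ',myrank,' received value ',buffer_recv(1)

  call MPI_FINALIZE(ier)

  end program nonblocking_poll_sketch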


