[cig-commits] r17932 - seismo/3D/SPECFEM3D_GLOBE/trunk

dkomati1 at geodynamics.org
Mon Feb 21 17:39:53 PST 2011


Author: dkomati1
Date: 2011-02-21 17:39:52 -0800 (Mon, 21 Feb 2011)
New Revision: 17932

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_central_cube.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle_Dev.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core_Dev.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_outer_core_Dev.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/specfem3D.f90
Log:
added non-blocking MPI for crust_mantle, inner_core and central_cube for the forward problem;
i.e., finished adding non-blocking MPI for the whole forward problem.
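
The pattern used throughout this commit replaces blocking MPI_SENDRECV / MPI_RECV / MPI_SEND
pairs with MPI_IRECV / MPI_ISSEND followed by MPI_TEST polling, driven by a phase counter that
the caller advances by re-entering the routine. A minimal sketch of that scheme follows
(illustrative only: the subroutine and argument names here are hypothetical, not the actual
SPECFEM3D_GLOBE interface, and itag stands in for the tag defined in constants.h):

  subroutine assemble_nonblocking_sketch(buffer_send,buffer_recv,npoin,dest,source,iphase)

  implicit none

! standard include of the MPI library
  include 'mpif.h'

  integer :: npoin,dest,source,iphase,ier
  double precision, dimension(npoin) :: buffer_send,buffer_recv

! requests must be saved because the routine returns to the caller
! while the messages are still in flight and is re-entered later
  integer, save :: request_send,request_receive
  integer, dimension(MPI_STATUS_SIZE) :: msg_status
  logical :: flag_result_test
  integer, parameter :: itag = 100

  if(iphase == 1) then
! post the receive first, then the synchronous-mode send
    call MPI_IRECV(buffer_recv,npoin,MPI_DOUBLE_PRECISION,source, &
                   itag,MPI_COMM_WORLD,request_receive,ier)
    call MPI_ISSEND(buffer_send,npoin,MPI_DOUBLE_PRECISION,dest, &
                    itag,MPI_COMM_WORLD,request_send,ier)
    iphase = iphase + 1
    return ! give the messages time to progress; the caller re-enters later
  endif

  if(iphase == 2) then
! poll with MPI_TEST instead of blocking in MPI_WAIT: if a message has
! not completed yet, return so the caller can do useful work meanwhile
    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
    if(.not. flag_result_test) return
    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
    if(.not. flag_result_test) return
! here one would add buffer_recv into the local degrees of freedom
    iphase = iphase + 1 ! go beyond the last phase number: assembly is finished
  endif

  end subroutine assemble_nonblocking_sketch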


Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_central_cube.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_central_cube.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_central_cube.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -25,29 +25,30 @@
 !
 !=====================================================================
 
-subroutine assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-  npoin2D_cube_from_slices, buffer_all_cube_from_slices, buffer_slices, buffer_slices2, ibool_central_cube, &
-  receiver_cube_from_slices, ibool_inner_core, idoubling_inner_core, NSPEC_INNER_CORE, &
-  ibelm_bottom_inner_core, NSPEC2D_BOTTOM_INNER_CORE,NGLOB_INNER_CORE,vector_assemble,ndim_assemble)
+  subroutine assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+    npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+    receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+    ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,vector_assemble,ndim_assemble,iphase_CC)
 
-! this version of the routine is based on non blocking MPI calls
-
   implicit none
 
 ! standard include of the MPI library
   include 'mpif.h'
   include 'constants.h'
 
+! include values created by the mesher
+  include "OUTPUT_FILES/values_from_mesher.h"
+
 ! for matching with central cube in inner core
-  integer ichunk, nb_msgs_theor_in_cube, npoin2D_cube_from_slices
+  integer ichunk, nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
   integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
-  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices,buffer_slices2
-  double precision, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM) :: buffer_all_cube_from_slices
+  double precision, dimension(npoin2D_cube_from_slices,ndim_assemble) :: buffer_slices
+  double precision, dimension(npoin2D_cube_from_slices,ndim_assemble,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
   integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
   integer receiver_cube_from_slices
 
 ! local to global mapping
-  integer NSPEC_INNER_CORE,NSPEC2D_BOTTOM_INNER_CORE, NGLOB_INNER_CORE
+  integer NSPEC2D_BOTTOM_INNER_CORE
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
   integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
   integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
@@ -63,62 +64,76 @@
   real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: array_central_cube
 
 ! MPI status of messages to be received
-  integer msg_status(MPI_STATUS_SIZE), ier
+  integer, save :: request_send,request_receive
+! maximum value of nb_msgs_theor_in_cube is 5 (when NPROC_XI == 1),
+! therefore NPROC_XI+4 is always large enough
+  integer, dimension(NPROC_XI_VAL+4), save :: request_send_array,request_receive_array
+  logical :: flag_result_test
+  integer, dimension(MPI_STATUS_SIZE) :: msg_status
+  integer :: ier
 
 ! mask
   logical, dimension(NGLOB_INNER_CORE) :: mask
 
 !---
-!---  now use buffers to assemble mass matrix with central cube once and for all
+!---  use buffers to assemble mass matrix with central cube once and for all
 !---
 
+  if(iphase_CC == 1) then
+
 ! on chunks AB and AB_ANTIPODE, receive all the messages from slices
   if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-
     do imsg = 1,nb_msgs_theor_in_cube-1
-
 ! receive buffers from slices
-    sender = sender_from_slices_to_cube(imsg)
-    call MPI_RECV(buffer_slices, &
+      sender = sender_from_slices_to_cube(imsg)
+      call MPI_IRECV(buffer_all_cube_from_slices(:,:,imsg), &
                 ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
-                itag,MPI_COMM_WORLD,msg_status,ier)
-
-! copy buffer in 2D array for each slice
-    buffer_all_cube_from_slices(imsg,:,1:ndim_assemble) = buffer_slices(:,1:ndim_assemble)
-
+                itag,MPI_COMM_WORLD,request_receive_array(imsg),ier)
     enddo
   endif
 
 ! send info to central cube from all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
   if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-
 ! for bottom elements in contact with central cube from the slices side
     ipoin = 0
     do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
-
       ispec = ibelm_bottom_inner_core(ispec2D)
-
 ! only for DOFs exactly on surface of central cube (bottom of these elements)
       k = 1
       do j = 1,NGLLY
         do i = 1,NGLLX
           ipoin = ipoin + 1
-          buffer_slices(ipoin,1:ndim_assemble) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
+          buffer_slices(ipoin,:) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
         enddo
       enddo
     enddo
-
 ! send buffer to central cube
     receiver = receiver_cube_from_slices
-    call MPI_SEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
-              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,ier)
-
+    call MPI_ISSEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
+              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send,ier)
  endif  ! end sending info to central cube
 
+  iphase_CC = iphase_CC + 1
+  return ! exit because we have started some communications, so we need to give them time to complete
 
+  endif !!!!!!!!! end of iphase_CC 1
+
+  if(iphase_CC == 2) then
+
+  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+  endif
+
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+    do imsg = 1,nb_msgs_theor_in_cube-1
+      call MPI_TEST(request_receive_array(imsg),flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not received yet
+    enddo
+  endif
+
 ! exchange of their bottom faces between chunks AB and AB_ANTIPODE
   if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-
     ipoin = 0
     do ispec = NSPEC_INNER_CORE, 1, -1
       if (idoubling_inner_core(ispec) == IFLAG_BOTTOM_CENTRAL_CUBE) then
@@ -126,26 +141,41 @@
         do j = 1,NGLLY
           do i = 1,NGLLX
             ipoin = ipoin + 1
-            buffer_slices(ipoin,1:ndim_assemble) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
+            buffer_slices(ipoin,:) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
           enddo
         enddo
       endif
     enddo
-
     sender = sender_from_slices_to_cube(nb_msgs_theor_in_cube)
+!   call MPI_SENDRECV(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
+!       itag,buffer_slices2,ndim_assemble*npoin2D_cube_from_slices,&
+!       MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,msg_status,ier)
 
-    call MPI_SENDRECV(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
-        itag,buffer_slices2,ndim_assemble*npoin2D_cube_from_slices,&
-        MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+    call MPI_IRECV(buffer_all_cube_from_slices(:,:,nb_msgs_theor_in_cube), &
+        ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,request_receive,ier)
+!! DK DK this merged with previous statement
+!   buffer_all_cube_from_slices(:,:,nb_msgs_theor_in_cube) = buffer_slices2(:,:)
 
-   buffer_all_cube_from_slices(nb_msgs_theor_in_cube,:,1:ndim_assemble) = buffer_slices2(:,1:ndim_assemble)
-
+    call MPI_ISSEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
+        itag,MPI_COMM_WORLD,request_send,ier)
   endif
 
+  iphase_CC = iphase_CC + 1
+  return ! exit because we have started some communications, so we need to give them time to complete
+
+  endif !!!!!!!!! end of iphase_CC 2
+
+  if(iphase_CC == 3) then
+
 !--- now we need to assemble the contributions
 
   if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
 
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+
     do idimension = 1,ndim_assemble
 ! erase contributions to central cube array
       array_central_cube(:) = 0._CUSTOM_REAL
@@ -155,9 +185,9 @@
       do imsg = 1,nb_msgs_theor_in_cube-1
         do ipoin = 1,npoin2D_cube_from_slices
           if(CUSTOM_REAL == SIZE_REAL) then
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(imsg,ipoin,idimension))
+            array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(ipoin,idimension,imsg))
           else
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(imsg,ipoin,idimension)
+            array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(ipoin,idimension,imsg)
           endif
         enddo
       enddo
@@ -169,11 +199,11 @@
           if(CUSTOM_REAL == SIZE_REAL) then
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            sngl(buffer_all_cube_from_slices(nb_msgs_theor_in_cube,ipoin,idimension))
+            sngl(buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube))
           else
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            buffer_all_cube_from_slices(nb_msgs_theor_in_cube,ipoin,idimension)
+            buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube)
           endif
           mask(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = .true.
         endif
@@ -197,7 +227,7 @@
 ! copy sum back
       do imsg = 1,nb_msgs_theor_in_cube-1
         do ipoin = 1,npoin2D_cube_from_slices
-          buffer_all_cube_from_slices(imsg,ipoin,idimension) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
+          buffer_all_cube_from_slices(ipoin,idimension,imsg) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
         enddo
       enddo
 
@@ -209,55 +239,88 @@
 
 ! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
   if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-
 ! receive buffers from slices
   sender = receiver_cube_from_slices
-  call MPI_RECV(buffer_slices, &
+  call MPI_IRECV(buffer_slices, &
               ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+              itag,MPI_COMM_WORLD,request_receive,ier)
+! for bottom elements in contact with central cube from the slices side
+!   ipoin = 0
+!   do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
+!     ispec = ibelm_bottom_inner_core(ispec2D)
+! only for DOFs exactly on surface of central cube (bottom of these elements)
+!     k = 1
+!     do j = 1,NGLLY
+!       do i = 1,NGLLX
+!         ipoin = ipoin + 1
+! distinguish between single and double precision for reals
+!         if(CUSTOM_REAL == SIZE_REAL) then
+!           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
+!         else
+!           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
+!         endif
+!       enddo
+!     enddo
+!   enddo
+ endif  ! end receiving info from central cube
 
+!------- send info back from central cube to slices
+
+! on chunk AB & CHUNK_AB_ANTIPODE, send all the messages to slices
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+    do imsg = 1,nb_msgs_theor_in_cube-1
+! send buffers to slices
+      receiver = sender_from_slices_to_cube(imsg)
+      call MPI_ISSEND(buffer_all_cube_from_slices(:,:,imsg),ndim_assemble*npoin2D_cube_from_slices, &
+              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send_array(imsg),ier)
+    enddo
+  endif
+
+  iphase_CC = iphase_CC + 1
+  return ! exit because we have started some communications, so we need to give them time to complete
+
+  endif !!!!!!!!! end of iphase_CC 3
+
+  if(iphase_CC == 4) then
+
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+    do imsg = 1,nb_msgs_theor_in_cube-1
+      call MPI_TEST(request_send_array(imsg),flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not sent yet
+    enddo
+  endif
+
+  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+  endif
+
+! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
+  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
 ! for bottom elements in contact with central cube from the slices side
     ipoin = 0
     do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
-
       ispec = ibelm_bottom_inner_core(ispec2D)
-
 ! only for DOFs exactly on surface of central cube (bottom of these elements)
       k = 1
       do j = 1,NGLLY
         do i = 1,NGLLX
           ipoin = ipoin + 1
-
 ! distinguish between single and double precision for reals
           if(CUSTOM_REAL == SIZE_REAL) then
-            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,1:ndim_assemble))
+            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
           else
-            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,1:ndim_assemble)
+            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
           endif
-
         enddo
       enddo
     enddo
-
  endif  ! end receiving info from central cube
 
-!------- send info back from central cube to slices
+! this is the exit condition, to go beyond the last phase number
+  iphase_CC = iphase_CC + 1
 
-! on chunk AB & CHUNK_AB_ANTIPODE, send all the messages to slices
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+  endif !!!!!!!!! end of iphase_CC 4
 
-   do imsg = 1,nb_msgs_theor_in_cube-1
+  end subroutine assemble_MPI_central_cube
 
-! copy buffer in 2D array for each slice
-   buffer_slices(:,1:ndim_assemble) = buffer_all_cube_from_slices(imsg,:,1:ndim_assemble)
-
-! send buffers to slices
-    receiver = sender_from_slices_to_cube(imsg)
-    call MPI_SEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
-              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,ier)
-
-   enddo
-   endif
-
-end subroutine assemble_MPI_central_cube
-

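Note that the routine above no longer completes the assembly in a single call: each call
performs at most one communication phase and returns immediately. A hypothetical driver loop
(the actual one lives in the modified specfem3D.f90, whose diff is not shown in this message;
passing accel_inner_core and NDIM as the actual arguments for vector_assemble and
ndim_assemble is an assumption) would look like:

! re-enter the routine until the phase counter goes beyond the last
! phase (4 here), doing useful local work between attempts
  iphase_CC = 1
  do while (iphase_CC <= 4)
    call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube, &
        sender_from_slices_to_cube,npoin2D_cube_from_slices, &
        buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
        receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
        ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE, &
        accel_inner_core,NDIM,iphase_CC)
! if iphase_CC did not advance, a message is still in flight:
! this is where computation can be overlapped with communication
  enddo
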
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_vector.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/assemble_MPI_vector.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -31,9 +31,7 @@
 !---- to reduce the total number of MPI calls
 !----
 
-  subroutine assemble_MPI_vector(myrank, &
-            accel_crust_mantle,NGLOB_CRUST_MANTLE, &
-            accel_inner_core,NGLOB_INNER_CORE, &
+  subroutine assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
             npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
@@ -41,20 +39,14 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces_vector,buffer_received_faces_vector, &
+            buffer_send_faces_vector,buffer_received_faces_vector,npoin2D_max_all_CM_IC, &
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-            NPROC_XI,NPROC_ETA, &
-            NGLOB1D_RADIAL_crust_mantle, &
-            NGLOB2DMAX_XMIN_XMAX_CM,NGLOB2DMAX_YMIN_YMAX_CM, &
-            NGLOB1D_RADIAL_inner_core, &
-            NGLOB2DMAX_XMIN_XMAX_IC,NGLOB2DMAX_YMIN_YMAX_IC, &
-            NGLOB2DMAX_XY,NCHUNKS)
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
+            NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL_crust_mantle, &
+            NGLOB1D_RADIAL_inner_core,NCHUNKS,iphase)
 
-! this version of the routine is based on non blocking MPI calls
-
   implicit none
 
 ! standard include of the MPI library
@@ -63,23 +55,25 @@
   include "constants.h"
   include "precision.h"
 
-  integer myrank,NGLOB_CRUST_MANTLE,NGLOB_INNER_CORE,NCHUNKS
+! include values created by the mesher
+  include "OUTPUT_FILES/values_from_mesher.h"
 
+  integer myrank,NCHUNKS,iphase
+
 ! the two arrays to assemble
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: accel_crust_mantle
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: accel_inner_core
 
   integer iproc_xi,iproc_eta,ichunk
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle
   integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
   integer npoin2D_faces_inner_core(NUMFACES_SHARED)
 
-  integer NGLOB2DMAX_XMIN_XMAX_CM,NGLOB2DMAX_YMIN_YMAX_CM,NGLOB1D_RADIAL_crust_mantle
-  integer NGLOB2DMAX_XMIN_XMAX_IC,NGLOB2DMAX_YMIN_YMAX_IC,NGLOB1D_RADIAL_inner_core
-  integer NPROC_XI,NPROC_ETA,NGLOB2DMAX_XY
-  integer NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+        npoin2D_xi_inner_core,npoin2D_eta_inner_core
 
+  integer NGLOB1D_RADIAL_crust_mantle,NGLOB1D_RADIAL_inner_core,NPROC_XI,NPROC_ETA
+  integer NUMMSGS_FACES,NCORNERSCHUNKS
+
 ! for addressing of the slices
   integer, dimension(NCHUNKS,0:NPROC_XI-1,0:NPROC_ETA-1) :: addressing
 
@@ -94,9 +88,11 @@
   integer, dimension(NGLOB1D_RADIAL_inner_core,NUMCORNERS_SHARED) :: iboolcorner_inner_core
   integer icount_corners
 
-  integer, dimension(NGLOB2DMAX_XY,NUMFACES_SHARED) :: iboolfaces_crust_mantle,iboolfaces_inner_core
-! size of buffers is multiplied by 2 because we handle two regions in the same MPI call
-  real(kind=CUSTOM_REAL), dimension(NDIM,2*NGLOB2DMAX_XY) :: buffer_send_faces_vector,buffer_received_faces_vector
+  integer :: npoin2D_max_all_CM_IC
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED) :: &
+      buffer_send_faces_vector,buffer_received_faces_vector
 
 ! buffers for send and receive between corners of the chunks
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
@@ -106,25 +102,40 @@
 ! ---- arrays to assemble between chunks
 
 ! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES) :: iprocfrom_faces,iprocto_faces,imsg_type
+  integer, dimension(NUMMSGS_FACES) :: iprocfrom_faces,iprocto_faces
 
 ! communication pattern for corners between chunks
   integer, dimension(NCORNERSCHUNKS) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
 
 ! MPI status of messages to be received
-  integer msg_status(MPI_STATUS_SIZE)
+  integer, dimension(MPI_STATUS_SIZE) :: msg_status
 
-  integer ipoin,ipoin2D,ipoin1D
-  integer sender,receiver,ier
-  integer imsg,imsg_loop
-  integer icount_faces,npoin2D_chunks_all
+  integer :: ipoin,ipoin2D,ipoin1D
+  integer :: sender,receiver
+  integer :: imsg
+  integer :: icount_faces,npoin2D_chunks_all
 
-  integer :: NGLOB1D_RADIAL_all,ioffset
+  integer :: NGLOB1D_RADIAL_all
   integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_all,npoin2D_eta_all
+
+! do not remove the "save" statement: because this routine is non-blocking,
+! it needs to find the right value of ioffset when it re-enters
+! the routine later to perform the next communication step
+  integer, save :: ioffset
+
+  integer :: ier
+! do not remove the "save" statement because this routine is non-blocking
+  integer, save :: request_send,request_receive
+  integer, dimension(NUMFACES_SHARED), save :: request_send_array,request_receive_array
+  logical :: flag_result_test
+
 ! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 
 ! check flag to see if we need to assemble (might be turned off when debugging)
-  if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES) return
+  if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES) then
+    iphase = 9999 ! this means everything is finished
+    return
+  endif
 
 ! here we have to assemble all the contributions between slices using MPI
 
@@ -140,23 +151,22 @@
 !---- first assemble along xi using the 2-D topology
 !----
 
-! assemble along xi only if more than one slice
-  if(NPROC_XI > 1) then
+  if(iphase == 1) then
 
 ! slices copy the right face into the buffer
+  do ipoin = 1,npoin2D_xi_crust_mantle(2)
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin))
+  enddo
+
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_xi_crust_mantle(2)
 
-  do ipoin = 1,npoin2D_xi_crust_mantle(2)
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin))
-  enddo
-
   do ipoin = 1,npoin2D_xi_inner_core(2)
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolright_xi_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolright_xi_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolright_xi_inner_core(ipoin))
   enddo
 
 ! send messages forward along each row
@@ -170,31 +180,48 @@
   else
     receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
   endif
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,receiver, &
+        itag2,MPI_COMM_WORLD,request_send,ier)
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications, so we need to give them time to complete
+
+  endif !!!!!!!!! end of iphase 1
+
+  if(iphase == 2) then
+
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+
 ! all slices add the buffer received to the contributions on the left face
   if(iproc_xi > 0) then
 
   do ipoin = 1,npoin2D_xi_crust_mantle(1)
     accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(1,ipoin)
+                              buffer_received_faces_vector(1,ipoin,1)
     accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(2,ipoin)
+                              buffer_received_faces_vector(2,ipoin,1)
     accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(3,ipoin)
+                              buffer_received_faces_vector(3,ipoin,1)
   enddo
 
+! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_xi_crust_mantle(1)
+
   do ipoin = 1,npoin2D_xi_inner_core(1)
     accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(1,ioffset + ipoin)
+                              buffer_received_faces_vector(1,ioffset + ipoin,1)
     accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(2,ioffset + ipoin)
+                              buffer_received_faces_vector(2,ioffset + ipoin,1)
     accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(3,ioffset + ipoin)
+                              buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
@@ -202,19 +229,19 @@
 ! the contributions are correctly assembled on the left side of each slice
 ! now we have to send the result back to the sender
 ! all slices copy the left face into the buffer
+  do ipoin = 1,npoin2D_xi_crust_mantle(1)
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin))
+  enddo
+
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_xi_crust_mantle(1)
 
-  do ipoin = 1,npoin2D_xi_crust_mantle(1)
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin))
-  enddo
-
   do ipoin = 1,npoin2D_xi_inner_core(1)
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin))
   enddo
 
 ! send messages backward along each row
@@ -228,51 +255,64 @@
   else
     receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
   endif
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,sender, &
+        itag,MPI_COMM_WORLD,request_receive,ier)
 
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,receiver, &
+        itag2,MPI_COMM_WORLD,request_send,ier)
+
+  iphase = iphase + 1
+  return ! exit because we have started some communications, so we need to give them time to complete
+
+  endif !!!!!!!!! end of iphase 2
+
+  if(iphase == 3) then
+
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_xi < NPROC_XI-1) then
 
   do ipoin = 1,npoin2D_xi_crust_mantle(2)
-    accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin)
-    accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin)
-    accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin)
+    accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin,1)
+    accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin,1)
+    accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin,1)
   enddo
 
+! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_xi_crust_mantle(2)
+
   do ipoin = 1,npoin2D_xi_inner_core(2)
-    accel_inner_core(1,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin)
-    accel_inner_core(2,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin)
-    accel_inner_core(3,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin)
+    accel_inner_core(1,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin,1)
+    accel_inner_core(2,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin,1)
+    accel_inner_core(3,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
 
-  endif
-
 !----
 !---- then assemble along eta using the 2-D topology
 !----
 
-! assemble along eta only if more than one slice
-  if(NPROC_ETA > 1) then
+! slices copy the right face into the buffer
+  do ipoin = 1,npoin2D_eta_crust_mantle(2)
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin))
+  enddo
 
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_eta_crust_mantle(2)
 
-! slices copy the right face into the buffer
-  do ipoin = 1,npoin2D_eta_crust_mantle(2)
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin))
-  enddo
-
   do ipoin = 1,npoin2D_eta_inner_core(2)
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolright_eta_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolright_eta_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolright_eta_inner_core(ipoin))
   enddo
 
 ! send messages forward along each row
@@ -286,30 +326,48 @@
   else
     receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
   endif
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,request_receive,ier)
 
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,receiver, &
+    itag2,MPI_COMM_WORLD,request_send,ier)
+
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 3
+
+  if(iphase == 4) then
+
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+
 ! all slices add the buffer received to the contributions on the left face
   if(iproc_eta > 0) then
 
   do ipoin = 1,npoin2D_eta_crust_mantle(1)
     accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(1,ipoin)
+                              buffer_received_faces_vector(1,ipoin,1)
     accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(2,ipoin)
+                              buffer_received_faces_vector(2,ipoin,1)
     accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(3,ipoin)
+                              buffer_received_faces_vector(3,ipoin,1)
   enddo
 
+! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_eta_crust_mantle(1)
+
   do ipoin = 1,npoin2D_eta_inner_core(1)
     accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(1,ioffset + ipoin)
+                              buffer_received_faces_vector(1,ioffset + ipoin,1)
     accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(2,ioffset + ipoin)
+                              buffer_received_faces_vector(2,ioffset + ipoin,1)
     accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(3,ioffset + ipoin)
+                              buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
@@ -317,19 +375,19 @@
 ! the contributions are correctly assembled on the left side of each slice
 ! now we have to send the result back to the sender
 ! all slices copy the left face into the buffer
+  do ipoin = 1,npoin2D_eta_crust_mantle(1)
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin))
+  enddo
+
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_eta_crust_mantle(1)
 
-  do ipoin = 1,npoin2D_eta_crust_mantle(1)
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin))
-  enddo
-
   do ipoin = 1,npoin2D_eta_inner_core(1)
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin))
   enddo
 
 ! send messages backward along each row
@@ -343,37 +401,56 @@
   else
     receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
   endif
-  call MPI_SENDRECV(buffer_send_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,sender, &
+    itag,MPI_COMM_WORLD,request_receive,ier)
 
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,receiver, &
+    itag2,MPI_COMM_WORLD,request_send,ier)
+
+  iphase = iphase + 1
+  return ! exit because we have started some communications, so we need to give them time to complete
+
+  endif !!!!!!!!! end of iphase 4
+
+  if(iphase == 5) then
+
+! call MPI_WAIT(request_send,msg_status,ier)
+! call MPI_WAIT(request_receive,msg_status,ier)
+  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not sent yet
+  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+  if(.not. flag_result_test) return ! exit if message not received yet
+
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_eta < NPROC_ETA-1) then
 
   do ipoin = 1,npoin2D_eta_crust_mantle(2)
-    accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin)
-    accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin)
-    accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin)
+    accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin,1)
+    accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin,1)
+    accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin,1)
   enddo
 
+! the buffer for the inner core starts right after the buffer for the crust and mantle
   ioffset = npoin2D_eta_crust_mantle(2)
+
   do ipoin = 1,npoin2D_eta_inner_core(2)
-    accel_inner_core(1,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin)
-    accel_inner_core(2,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin)
-    accel_inner_core(3,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin)
+    accel_inner_core(1,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin,1)
+    accel_inner_core(2,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin,1)
+    accel_inner_core(3,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
 
-  endif
-
 !----
 !---- start MPI assembling phase between chunks
 !----
 
 ! check flag to see if we need to assemble (might be turned off when debugging)
 ! and do not assemble if only one chunk
-  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) return
+  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) then
+    iphase = 9999 ! this means everything is finished
+    return
+  endif
 
 ! ***************************************************************
 !  transmit messages in forward direction (iprocfrom -> iprocto)
@@ -382,43 +459,41 @@
 !---- put slices in receive mode
 !---- a given slice can belong to at most two faces
 
-! use three step scheme that can never deadlock
-! scheme for faces cannot deadlock even if NPROC_XI = NPROC_ETA = 1
-  do imsg_loop = 1,NUM_MSG_TYPES
-
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
     sender = iprocfrom_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+    call MPI_IRECV(buffer_received_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 
-    call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+!   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+!     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+!        accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D,icount_faces)
+!     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+!        accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D,icount_faces)
+!     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+!        accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D,icount_faces)
+!   enddo
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D)
-      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D)
-      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D)
-    enddo
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+!   ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ioffset + ipoin2D)
-      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ioffset + ipoin2D)
-      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ioffset + ipoin2D)
-    enddo
+!   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+!     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!        accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+!          buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!        accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+!          buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!        accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+!          buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+!   enddo
 
   endif
   enddo
@@ -427,35 +502,90 @@
 !---- a given slice can belong to at most two faces
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
     receiver = iprocto_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      buffer_send_faces_vector(1,ipoin2D,icount_faces) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ipoin2D,icount_faces) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ipoin2D,icount_faces) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+    enddo
+
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
     ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      buffer_send_faces_vector(1,ioffset + ipoin2D,icount_faces) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ioffset + ipoin2D,icount_faces) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ioffset + ipoin2D,icount_faces) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
+    enddo
+
+    call MPI_ISSEND(buffer_send_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag, &
+                     MPI_COMM_WORLD,request_send_array(icount_faces),ier)
+  endif
+  enddo
+
+  iphase = iphase + 1
+  return ! exit because we have started some communications, so we need to give them time to complete
+
+  endif !!!!!!!!! end of iphase 5
+
+  if(iphase == 6) then
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+  endif
+  enddo
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+  endif
+  enddo
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+
     do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      buffer_send_faces_vector(1,ipoin2D) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ipoin2D) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ipoin2D) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D,icount_faces)
+      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D,icount_faces)
+      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D,icount_faces)
     enddo
 
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
     do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      buffer_send_faces_vector(1,ioffset + ipoin2D) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ioffset + ipoin2D) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ioffset + ipoin2D) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
+      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+           buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+           buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+           buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
     enddo
 
-    call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
-
   endif
   enddo
 
-
 ! *********************************************************************
 !  transmit messages back in opposite direction (iprocto -> iprocfrom)
 ! *********************************************************************
@@ -465,31 +595,33 @@
 
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
     sender = iprocto_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+    call MPI_IRECV(buffer_received_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 
-    call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+!   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+!     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D,icount_faces)
+!     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D,icount_faces)
+!     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D,icount_faces)
+!   enddo
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D)
-      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D)
-      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D)
-    enddo
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+!   ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ioffset + ipoin2D)
-      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ioffset + ipoin2D)
-      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ioffset + ipoin2D)
-    enddo
+!   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+!     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!       buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!       buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!       buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+!   enddo
 
   endif
   enddo
@@ -498,38 +630,87 @@
 !---- a given slice can belong to at most two faces
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
     receiver = iprocfrom_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      buffer_send_faces_vector(1,ipoin2D,icount_faces) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ipoin2D,icount_faces) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ipoin2D,icount_faces) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+    enddo
+
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
     ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      buffer_send_faces_vector(1,ipoin2D) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ipoin2D) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ipoin2D) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-    enddo
-
     do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      buffer_send_faces_vector(1,ioffset + ipoin2D) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ioffset + ipoin2D) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ioffset + ipoin2D) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(1,ioffset + ipoin2D,icount_faces) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ioffset + ipoin2D,icount_faces) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ioffset + ipoin2D,icount_faces) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
     enddo
 
-    call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+    call MPI_ISSEND(buffer_send_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag, &
+                     MPI_COMM_WORLD,request_send_array(icount_faces),ier)
+  endif
+  enddo
 
+  iphase = iphase + 1
+  return ! exit because we have just started some communications, so give them time to progress
+
+  endif !!!!!!!!! end of iphase 6
+
+  if(iphase == 7) then
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message has not been sent yet
   endif
   enddo
 
-! end of anti-deadlocking loop
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message has not been received yet
+  endif
   enddo
 
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D,icount_faces)
+      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D,icount_faces)
+      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D,icount_faces)
+    enddo
 
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+    enddo
+  endif
+  enddo
+
+! this is the exit condition, to go beyond the last phase number
+  iphase = iphase + 1
+
+!! DK DK do the rest in blocking for now, for simplicity
+
 !----
 !---- start MPI assembling corners
 !----
@@ -703,5 +884,7 @@
 
   enddo
 
+  endif !!!!!!!!! end of iphase 7
+
   end subroutine assemble_MPI_vector
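
The pattern above is the core of this change: instead of blocking sends, assemble_MPI_vector now posts MPI_ISSEND and, on later calls, polls the pending requests with MPI_TEST, returning immediately whenever a message is still in flight and advancing iphase by one step otherwise. Below is a minimal, self-contained sketch of that return-and-retry state machine (not part of the commit; the program name, try_to_advance, the tag and the buffer size are illustrative, and it assumes exactly two MPI ranks):

  program phase_machine_demo

  implicit none

! standard include of the MPI library
  include 'mpif.h'

  integer, parameter :: NPOIN = 100
  double precision :: sendbuf(NPOIN), recvbuf(NPOIN)
  integer :: myrank, ier, iphase
  integer :: request_send, request_recv

  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ier)
  sendbuf(:) = dble(myrank)

  iphase = 1
  do while (iphase <= 3)
    call try_to_advance(iphase)
! in the real solver, element computations are interleaved here
  enddo

  call MPI_FINALIZE(ier)

  contains

  subroutine try_to_advance(iphase)

  integer, intent(inout) :: iphase
  integer :: other, msg_status(MPI_STATUS_SIZE)
  logical :: flag_result_test

! assumes exactly two MPI ranks, which exchange one message each
  other = 1 - myrank

  if(iphase == 1) then
! post the non blocking communications, then return immediately
    call MPI_IRECV(recvbuf,NPOIN,MPI_DOUBLE_PRECISION,other,0, &
                    MPI_COMM_WORLD,request_recv,ier)
    call MPI_ISSEND(sendbuf,NPOIN,MPI_DOUBLE_PRECISION,other,0, &
                    MPI_COMM_WORLD,request_send,ier)
    iphase = iphase + 1
    return
  endif

  if(iphase == 2) then
    call MPI_TEST(request_recv,flag_result_test,msg_status,ier)
    if(.not. flag_result_test) return ! exit if message has not been received yet
    iphase = iphase + 1
    return
  endif

  if(iphase == 3) then
    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
    if(.not. flag_result_test) return ! exit if message has not been sent yet
! this is the exit condition, to go beyond the last phase number
    iphase = iphase + 1
  endif

  end subroutine try_to_advance

  end program phase_machine_demo

Because every call either advances one phase or returns at once, the caller is free to interleave element computations between calls, which is exactly what the compute_forces_* routines below do.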
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -26,8 +26,26 @@
 !=====================================================================
 
   subroutine compute_forces_crust_mantle(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ,accel,xstore,ystore,zstore, &
+          displ_crust_mantle,accel_crust_mantle,xstore,ystore,zstore, &
           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -70,7 +88,7 @@
   integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling
 
 ! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ,accel
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ_crust_mantle,accel_crust_mantle
 
 ! memory variables for attenuation
 ! memory variables R_ij are stored at the local rather than global level
@@ -175,14 +193,109 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
   double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
 
+! these variables are for the non blocking MPI scheme
+  integer :: iphase,icall
+
+  integer :: computed_elements
+
+  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: accel_inner_core
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
+
+  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
+
+  integer :: ichunk,iproc_xi,iproc_eta,myrank
+
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+
+  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
+  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+       npoin2D_xi_inner_core,npoin2D_eta_inner_core
+
+! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
+
+! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
+  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
+
+  integer :: npoin2D_max_all_CM_IC
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
+     buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector
+
+! for matching with central cube in inner core
+  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
+  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
+  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
+  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
+  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
+  integer receiver_cube_from_slices
+  logical :: INCLUDE_CENTRAL_CUBE
+
+! local to global mapping
+  integer NSPEC2D_BOTTOM_INNER_CORE
+  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
+
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
-! set acceleration to zero
-!  accel(:,:) = 0._CUSTOM_REAL
+  computed_elements = 0
 
   do ispec = 1,NSPEC_CRUST_MANTLE
+
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
+
+! process the communications every ELEMENTS_NONBLOCKING_CM_IC elements
+    computed_elements = computed_elements + 1
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
+
+      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
+            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+
+      if(INCLUDE_CENTRAL_CUBE) then
+          if(iphase > 7 .and. iphase_CC <= 4) &
+            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
+      endif
+
+    endif
+
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
@@ -202,25 +315,25 @@
           do l=1,NGLLX
             hp1 = hprime_xx(i,l)
             iglob = ibool(l,j,k,ispec)
-            tempx1l = tempx1l + displ(1,iglob)*hp1
-            tempy1l = tempy1l + displ(2,iglob)*hp1
-            tempz1l = tempz1l + displ(3,iglob)*hp1
+            tempx1l = tempx1l + displ_crust_mantle(1,iglob)*hp1
+            tempy1l = tempy1l + displ_crust_mantle(2,iglob)*hp1
+            tempz1l = tempz1l + displ_crust_mantle(3,iglob)*hp1
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
             hp2 = hprime_yy(j,l)
             iglob = ibool(i,l,k,ispec)
-            tempx2l = tempx2l + displ(1,iglob)*hp2
-            tempy2l = tempy2l + displ(2,iglob)*hp2
-            tempz2l = tempz2l + displ(3,iglob)*hp2
+            tempx2l = tempx2l + displ_crust_mantle(1,iglob)*hp2
+            tempy2l = tempy2l + displ_crust_mantle(2,iglob)*hp2
+            tempz2l = tempz2l + displ_crust_mantle(3,iglob)*hp2
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
             hp3 = hprime_zz(k,l)
             iglob = ibool(i,j,l,ispec)
-            tempx3l = tempx3l + displ(1,iglob)*hp3
-            tempy3l = tempy3l + displ(2,iglob)*hp3
-            tempz3l = tempz3l + displ(3,iglob)*hp3
+            tempx3l = tempx3l + displ_crust_mantle(1,iglob)*hp3
+            tempy3l = tempy3l + displ_crust_mantle(2,iglob)*hp3
+            tempz3l = tempz3l + displ_crust_mantle(3,iglob)*hp3
           enddo
 
 !         get derivatives of ux, uy and uz with respect to x, y and z
@@ -646,9 +759,9 @@
             if(CUSTOM_REAL == SIZE_REAL) then
 
         ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dble(displ(1,iglob))
-              sy_l = rho * dble(displ(2,iglob))
-              sz_l = rho * dble(displ(3,iglob))
+              sx_l = rho * dble(displ_crust_mantle(1,iglob))
+              sy_l = rho * dble(displ_crust_mantle(2,iglob))
+              sz_l = rho * dble(displ_crust_mantle(3,iglob))
 
         ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
@@ -673,9 +786,9 @@
             else
 
         ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * displ(1,iglob)
-              sy_l = rho * displ(2,iglob)
-              sz_l = rho * displ(3,iglob)
+              sx_l = rho * displ_crust_mantle(1,iglob)
+              sy_l = rho * displ_crust_mantle(2,iglob)
+              sz_l = rho * displ_crust_mantle(3,iglob)
 
         ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
@@ -774,9 +887,9 @@
       do j=1,NGLLY
         do i=1,NGLLX
           iglob = ibool(i,j,k,ispec)
-          accel(1,iglob) = accel(1,iglob) + sum_terms(1,i,j,k)
-          accel(2,iglob) = accel(2,iglob) + sum_terms(2,i,j,k)
-          accel(3,iglob) = accel(3,iglob) + sum_terms(3,i,j,k)
+          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,i,j,k)
+          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,i,j,k)
+          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,i,j,k)
         enddo
       enddo
     enddo
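
The is_on_a_slice_edge_crust_mantle / icall logic added at the top of the element loop is what makes the overlap possible: pass 1 (icall == 1) computes only the elements that touch a slice edge, so that their contributions can be sent as early as possible, and pass 2 (icall == 2) computes the interior elements while the messages travel, polling the communications every ELEMENTS_NONBLOCKING_CM_IC elements. A reduced, compilable sketch of that control flow (not from the commit; nspec, the edge mask and the print statements are stand-ins for the real mesh data and assembly calls):

  program overlap_demo

  implicit none

  integer, parameter :: nspec = 10, ELEMENTS_NONBLOCKING = 3
  logical :: is_on_a_slice_edge(nspec)
  integer :: ispec, icall, computed_elements

! mark the first and last elements as lying on a slice edge
  is_on_a_slice_edge(:) = .false.
  is_on_a_slice_edge(1) = .true.
  is_on_a_slice_edge(nspec) = .true.

  do icall = 1,2

    computed_elements = 0

    do ispec = 1,nspec

! hide communications: pass 1 keeps only the edge elements,
! pass 2 keeps only the interior elements
      if((icall == 2 .and. is_on_a_slice_edge(ispec)) .or. &
         (icall == 1 .and. .not. is_on_a_slice_edge(ispec))) cycle

! during pass 2, poll the pending communications every few elements
      computed_elements = computed_elements + 1
      if(icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING) == 0) &
        print *,'pass 2: would poll assemble_MPI_vector() here'

      print *,'pass ',icall,': computing element ',ispec

    enddo

! after pass 1 all edge contributions are complete, so this is
! where the non blocking sends and receives would be posted
    if(icall == 1) print *,'pass 1 done: would post MPI_ISSEND/MPI_IRECV here'

  enddo

  end program overlap_demo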

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle_Dev.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_crust_mantle_Dev.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -25,10 +25,27 @@
 !
 !=====================================================================
 
-
   subroutine compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ,accel,xstore,ystore,zstore, &
+          displ_crust_mantle,accel_crust_mantle,xstore,ystore,zstore, &
           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_xxT, &
           hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -39,9 +56,8 @@
           ibool,idoubling,R_memory,epsilondev,epsilon_trace_over_3,one_minus_sum_beta, &
           alphaval,betaval,gammaval,factor_common,vx,vy,vz,vnspec)
 
+! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
 
-! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5
-
   implicit none
 
   include "constants.h"
@@ -51,7 +67,7 @@
   include "OUTPUT_FILES/values_from_mesher.h"
 
   ! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ,accel
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ_crust_mantle,accel_crust_mantle
   ! arrays with mesh parameters per slice
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool
 
@@ -182,22 +198,117 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
 
-  !real(kind=CUSTOM_REAL), dimension(5) :: dummy
-
   integer :: i_SLS,i_memory,imodulo_N_SLS
   integer :: ispec,ispec_strain
   integer :: i,j,k
   integer :: int_radius
   integer :: iglob1,iglob2,iglob3,iglob4,iglob5
 
+! these variables are for the non blocking MPI scheme
+  integer :: iphase,icall
+
+  integer :: computed_elements
+
+  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: accel_inner_core
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
+
+  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
+
+  integer :: ichunk,iproc_xi,iproc_eta,myrank
+
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+
+  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
+  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+       npoin2D_xi_inner_core,npoin2D_eta_inner_core
+
+! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
+
+! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
+  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
+
+  integer :: npoin2D_max_all_CM_IC
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
+     buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector
+
+! for matching with central cube in inner core
+  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
+  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
+  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
+  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
+  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
+  integer receiver_cube_from_slices
+  logical :: INCLUDE_CENTRAL_CUBE
+
+! local to global mapping
+  integer NSPEC2D_BOTTOM_INNER_CORE
+  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
+
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
   imodulo_N_SLS = mod(N_SLS,3)
 
+  computed_elements = 0
+
   do ispec = 1,NSPEC_CRUST_MANTLE
 
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
+
+! process the communications every ELEMENTS_NONBLOCKING_CM_IC elements
+    computed_elements = computed_elements + 1
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
+
+      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
+            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+
+      if(INCLUDE_CENTRAL_CUBE) then
+          if(iphase > 7 .and. iphase_CC <= 4) &
+            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
+      endif
+
+    endif
+
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
@@ -207,9 +318,9 @@
 ! way 1:
 !        do i=1,NGLLX
 !            iglob = ibool(i,j,k,ispec)
-!            dummyx_loc(i,j,k) = displ(1,iglob)
-!            dummyy_loc(i,j,k) = displ(2,iglob)
-!            dummyz_loc(i,j,k) = displ(3,iglob)
+!            dummyx_loc(i,j,k) = displ_crust_mantle(1,iglob)
+!            dummyy_loc(i,j,k) = displ_crust_mantle(2,iglob)
+!            dummyz_loc(i,j,k) = displ_crust_mantle(3,iglob)
 !        enddo
 
 ! way 2:
@@ -220,25 +331,25 @@
         iglob4 = ibool(4,j,k,ispec)
         iglob5 = ibool(5,j,k,ispec)
 
-        dummyx_loc(1,j,k) = displ(1,iglob1)
-        dummyy_loc(1,j,k) = displ(2,iglob1)
-        dummyz_loc(1,j,k) = displ(3,iglob1)
+        dummyx_loc(1,j,k) = displ_crust_mantle(1,iglob1)
+        dummyy_loc(1,j,k) = displ_crust_mantle(2,iglob1)
+        dummyz_loc(1,j,k) = displ_crust_mantle(3,iglob1)
 
-        dummyx_loc(2,j,k) = displ(1,iglob2)
-        dummyy_loc(2,j,k) = displ(2,iglob2)
-        dummyz_loc(2,j,k) = displ(3,iglob2)
+        dummyx_loc(2,j,k) = displ_crust_mantle(1,iglob2)
+        dummyy_loc(2,j,k) = displ_crust_mantle(2,iglob2)
+        dummyz_loc(2,j,k) = displ_crust_mantle(3,iglob2)
 
-        dummyx_loc(3,j,k) = displ(1,iglob3)
-        dummyy_loc(3,j,k) = displ(2,iglob3)
-        dummyz_loc(3,j,k) = displ(3,iglob3)
+        dummyx_loc(3,j,k) = displ_crust_mantle(1,iglob3)
+        dummyy_loc(3,j,k) = displ_crust_mantle(2,iglob3)
+        dummyz_loc(3,j,k) = displ_crust_mantle(3,iglob3)
 
-        dummyx_loc(4,j,k) = displ(1,iglob4)
-        dummyy_loc(4,j,k) = displ(2,iglob4)
-        dummyz_loc(4,j,k) = displ(3,iglob4)
+        dummyx_loc(4,j,k) = displ_crust_mantle(1,iglob4)
+        dummyy_loc(4,j,k) = displ_crust_mantle(2,iglob4)
+        dummyz_loc(4,j,k) = displ_crust_mantle(3,iglob4)
 
-        dummyx_loc(5,j,k) = displ(1,iglob5)
-        dummyy_loc(5,j,k) = displ(2,iglob5)
-        dummyz_loc(5,j,k) = displ(3,iglob5)
+        dummyx_loc(5,j,k) = displ_crust_mantle(1,iglob5)
+        dummyy_loc(5,j,k) = displ_crust_mantle(2,iglob5)
+        dummyz_loc(5,j,k) = displ_crust_mantle(3,iglob5)
 
       enddo
     enddo
@@ -792,9 +903,9 @@
             if(CUSTOM_REAL == SIZE_REAL) then
 
               ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dble(displ(1,iglob1))
-              sy_l = rho * dble(displ(2,iglob1))
-              sz_l = rho * dble(displ(3,iglob1))
+              sx_l = rho * dble(displ_crust_mantle(1,iglob1))
+              sy_l = rho * dble(displ_crust_mantle(2,iglob1))
+              sz_l = rho * dble(displ_crust_mantle(3,iglob1))
 
               ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
@@ -819,9 +930,9 @@
             else
 
               ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * displ(1,iglob1)
-              sy_l = rho * displ(2,iglob1)
-              sz_l = rho * displ(3,iglob1)
+              sx_l = rho * displ_crust_mantle(1,iglob1)
+              sy_l = rho * displ_crust_mantle(2,iglob1)
+              sz_l = rho * displ_crust_mantle(3,iglob1)
 
               ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
@@ -961,15 +1072,15 @@
 ! way 1:
 !        do i=1,NGLLX
 !          iglob = ibool(i,j,k,ispec)
-!          accel(:,iglob) = accel(:,iglob) + sum_terms(:,i,j,k)
+!          accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) + sum_terms(:,i,j,k)
 !        enddo
 
 ! way 2:
-        accel(:,ibool(1,j,k,ispec)) = accel(:,ibool(1,j,k,ispec)) + sum_terms(:,1,j,k)
-        accel(:,ibool(2,j,k,ispec)) = accel(:,ibool(2,j,k,ispec)) + sum_terms(:,2,j,k)
-        accel(:,ibool(3,j,k,ispec)) = accel(:,ibool(3,j,k,ispec)) + sum_terms(:,3,j,k)
-        accel(:,ibool(4,j,k,ispec)) = accel(:,ibool(4,j,k,ispec)) + sum_terms(:,4,j,k)
-        accel(:,ibool(5,j,k,ispec)) = accel(:,ibool(5,j,k,ispec)) + sum_terms(:,5,j,k)
+        accel_crust_mantle(:,ibool(1,j,k,ispec)) = accel_crust_mantle(:,ibool(1,j,k,ispec)) + sum_terms(:,1,j,k)
+        accel_crust_mantle(:,ibool(2,j,k,ispec)) = accel_crust_mantle(:,ibool(2,j,k,ispec)) + sum_terms(:,2,j,k)
+        accel_crust_mantle(:,ibool(3,j,k,ispec)) = accel_crust_mantle(:,ibool(3,j,k,ispec)) + sum_terms(:,3,j,k)
+        accel_crust_mantle(:,ibool(4,j,k,ispec)) = accel_crust_mantle(:,ibool(4,j,k,ispec)) + sum_terms(:,4,j,k)
+        accel_crust_mantle(:,ibool(5,j,k,ispec)) = accel_crust_mantle(:,ibool(5,j,k,ispec)) + sum_terms(:,5,j,k)
 
       enddo
     enddo
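
The new header comment in the _Dev routine names the key optimization: with NGLLX = NGLLY = NGLLZ = 5, every spatial derivative reduces to products of a 5x5 derivative matrix with the field reshaped as a 5x25 (or 25x5) matrix, and the sum over the common dimension is unrolled by hand so the compiler can keep the five operands in registers; the "way 2" blocks above unroll the global-to-local gather in the same spirit. A stand-alone illustration of one such unrolled product (not from the commit; the matrix contents are random):

  program deville_mxm_demo

  implicit none

! NGLL = 5, so m1 = NGLL and m2 = NGLL*NGLL
  integer, parameter :: m1 = 5, m2 = 25
  double precision :: A(m1,m1), B(m1,m2), C(m1,m2)
  integer :: i,j

  call random_number(A)
  call random_number(B)

! matrix product C = A * B with the inner loop over the common
! dimension (always 5) fully unrolled, as in Deville, Fischer and
! Mund, High-order methods for incompressible fluid flow (2002)
  do j = 1,m2
    do i = 1,m1
      C(i,j) = A(i,1)*B(1,j) + A(i,2)*B(2,j) + A(i,3)*B(3,j) + &
               A(i,4)*B(4,j) + A(i,5)*B(5,j)
    enddo
  enddo

  print *,'C(1,1) = ',C(1,1)

  end program deville_mxm_demo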

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -26,8 +26,26 @@
 !=====================================================================
 
   subroutine compute_forces_inner_core(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ,accel,xstore,ystore,zstore, &
+          displ_inner_core,accel_inner_core,xstore,ystore,zstore, &
           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -45,7 +63,7 @@
   include "OUTPUT_FILES/values_from_mesher.h"
 
 ! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ,accel
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
 
 ! for attenuation
 ! memory variables R_ij are stored at the local rather than global level
@@ -131,15 +149,112 @@
   double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
   real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: xstore,ystore,zstore
 
+! these variables are for the non blocking MPI scheme
+  integer :: iphase,icall
+
+  integer :: computed_elements
+
+  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: accel_crust_mantle
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
+
+  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
+
+  integer :: ichunk,iproc_xi,iproc_eta,myrank
+
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+
+  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
+  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+       npoin2D_xi_inner_core,npoin2D_eta_inner_core
+
+! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
+
+! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
+  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
+
+  integer :: npoin2D_max_all_CM_IC
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
+     buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector
+
+! for matching with central cube in inner core
+  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
+  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
+  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
+  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
+  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
+  integer receiver_cube_from_slices
+  logical :: INCLUDE_CENTRAL_CUBE
+
+! local to global mapping
+  integer NSPEC2D_BOTTOM_INNER_CORE
+  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
+
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
+  computed_elements = 0
+
   do ispec = 1,NSPEC_INNER_CORE
 
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_inner_core(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_inner_core(ispec))) cycle
+
 ! exclude fictitious elements in central cube
     if(idoubling(ispec) /= IFLAG_IN_FICTITIOUS_CUBE) then
 
+! process the communications every ELEMENTS_NONBLOCKING_CM_IC elements
+    computed_elements = computed_elements + 1
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
+
+      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
+            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+
+      if(INCLUDE_CENTRAL_CUBE) then
+          if(iphase > 7 .and. iphase_CC <= 4) &
+            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
+      endif
+
+    endif
+
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
@@ -159,25 +274,25 @@
           do l=1,NGLLX
             hp1 = hprime_xx(i,l)
             iglob = ibool(l,j,k,ispec)
-            tempx1l = tempx1l + displ(1,iglob)*hp1
-            tempy1l = tempy1l + displ(2,iglob)*hp1
-            tempz1l = tempz1l + displ(3,iglob)*hp1
+            tempx1l = tempx1l + displ_inner_core(1,iglob)*hp1
+            tempy1l = tempy1l + displ_inner_core(2,iglob)*hp1
+            tempz1l = tempz1l + displ_inner_core(3,iglob)*hp1
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLY
             hp2 = hprime_yy(j,l)
             iglob = ibool(i,l,k,ispec)
-            tempx2l = tempx2l + displ(1,iglob)*hp2
-            tempy2l = tempy2l + displ(2,iglob)*hp2
-            tempz2l = tempz2l + displ(3,iglob)*hp2
+            tempx2l = tempx2l + displ_inner_core(1,iglob)*hp2
+            tempy2l = tempy2l + displ_inner_core(2,iglob)*hp2
+            tempz2l = tempz2l + displ_inner_core(3,iglob)*hp2
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          enddo
 
 !!! can merge these loops because NGLLX = NGLLY = NGLLZ          do l=1,NGLLZ
             hp3 = hprime_zz(k,l)
             iglob = ibool(i,j,l,ispec)
-            tempx3l = tempx3l + displ(1,iglob)*hp3
-            tempy3l = tempy3l + displ(2,iglob)*hp3
-            tempz3l = tempz3l + displ(3,iglob)*hp3
+            tempx3l = tempx3l + displ_inner_core(1,iglob)*hp3
+            tempy3l = tempy3l + displ_inner_core(2,iglob)*hp3
+            tempz3l = tempz3l + displ_inner_core(3,iglob)*hp3
           enddo
 
 !         get derivatives of ux, uy and uz with respect to x, y and z
@@ -381,9 +496,9 @@
             if(CUSTOM_REAL == SIZE_REAL) then
 
 ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dble(displ(1,iglob))
-              sy_l = rho * dble(displ(2,iglob))
-              sz_l = rho * dble(displ(3,iglob))
+              sx_l = rho * dble(displ_inner_core(1,iglob))
+              sy_l = rho * dble(displ_inner_core(2,iglob))
+              sz_l = rho * dble(displ_inner_core(3,iglob))
 
 ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
@@ -408,9 +523,9 @@
             else
 
 ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * displ(1,iglob)
-              sy_l = rho * displ(2,iglob)
-              sz_l = rho * displ(3,iglob)
+              sx_l = rho * displ_inner_core(1,iglob)
+              sy_l = rho * displ_inner_core(2,iglob)
+              sz_l = rho * displ_inner_core(3,iglob)
 
 ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
@@ -510,7 +625,7 @@
       do j=1,NGLLY
         do i=1,NGLLX
           iglob = ibool(i,j,k,ispec)
-          accel(:,iglob) = accel(:,iglob) + sum_terms(:,i,j,k)
+          accel_inner_core(:,iglob) = accel_inner_core(:,iglob) + sum_terms(:,i,j,k)
         enddo
       enddo
     enddo
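
Note how the polling block added to the element loop chains two independent state machines: iphase (phases up to 7) drives the crust_mantle and inner_core assembly, and only once it has run past its last phase does iphase_CC (phases up to 4) start driving the central cube assembly. A reduced sketch of that chaining (not from the commit; advance() stands in for assemble_MPI_vector and assemble_MPI_central_cube, which may also return without advancing when MPI_TEST reports an incomplete message):

  program chained_phases_demo

  implicit none

  integer :: iphase, iphase_CC

  iphase = 1
  iphase_CC = 1

  do while (iphase <= 7 .or. iphase_CC <= 4)

! the central cube machine starts only once the first machine is done
    if(iphase <= 7) then
      call advance(iphase)
    else if(iphase_CC <= 4) then
      call advance(iphase_CC)
    endif

! in the real solver, element computations are interleaved here

  enddo

  contains

  subroutine advance(iphase_current)

  integer, intent(inout) :: iphase_current

! the real routines return without advancing when MPI_TEST says a
! message is still in flight; this stub always advances one phase
  iphase_current = iphase_current + 1

  end subroutine advance

  end program chained_phases_demo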

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core_Dev.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_inner_core_Dev.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -25,10 +25,27 @@
 !
 !=====================================================================
 
-
   subroutine compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ,accel,xstore,ystore,zstore, &
+          displ_inner_core,accel_inner_core,xstore,ystore,zstore, &
           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
           kappavstore,muvstore,ibool,idoubling, &
@@ -36,6 +53,8 @@
           one_minus_sum_beta,alphaval,betaval,gammaval,factor_common, &
           vx,vy,vz,vnspec)
 
+! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
+
   implicit none
 
   include "constants.h"
@@ -45,7 +64,7 @@
   include "OUTPUT_FILES/values_from_mesher.h"
 
   ! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ,accel
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool
 
   real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: xstore,ystore,zstore
@@ -127,7 +146,6 @@
   equivalence(newtempy3,E2_mxm_m2_m1_5points)
   equivalence(newtempz3,E3_mxm_m2_m1_5points)
 
-
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: sum_terms
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
   real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
@@ -157,26 +175,120 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
 
-  !real(kind=CUSTOM_REAL), dimension(5) :: dummy
-
   integer :: int_radius
   integer :: ispec,ispec_strain
   integer :: i,j,k !,l
   integer :: i_SLS,i_memory,imodulo_N_SLS
   integer :: iglob1,iglob2,iglob3,iglob4,iglob5
 
+! these variables are for the non blocking MPI scheme
+  integer :: iphase,icall
 
+  integer :: computed_elements
+
+  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: accel_crust_mantle
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
+
+  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
+
+  integer :: ichunk,iproc_xi,iproc_eta,myrank
+
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+
+  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
+  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+       npoin2D_xi_inner_core,npoin2D_eta_inner_core
+
+! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
+
+! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
+  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
+
+  integer :: npoin2D_max_all_CM_IC
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
+
+! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
+     buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector
+
+! for matching with central cube in inner core
+  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
+  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
+  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
+  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
+  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
+  integer receiver_cube_from_slices
+  logical :: INCLUDE_CENTRAL_CUBE
+
+! local to global mapping
+  integer NSPEC2D_BOTTOM_INNER_CORE
+  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
+
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
   imodulo_N_SLS = mod(N_SLS,3)
 
+  computed_elements = 0
+
   do ispec = 1,NSPEC_INNER_CORE
 
+! hide communications by computing the edges first
+    if((icall == 2 .and. is_on_a_slice_edge_inner_core(ispec)) .or. &
+       (icall == 1 .and. .not. is_on_a_slice_edge_inner_core(ispec))) cycle
+
     ! exclude fictitious elements in central cube
     if(idoubling(ispec) /= IFLAG_IN_FICTITIOUS_CUBE) then
 
+! process the communications every ELEMENTS_NONBLOCKING_CM_IC elements
+    computed_elements = computed_elements + 1
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
+
+      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
+            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+
+      if(INCLUDE_CENTRAL_CUBE) then
+          if(iphase > 7 .and. iphase_CC <= 4) &
+            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
+      endif
+
+    endif
+
       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
       ! for incompressible fluid flow, Cambridge University Press (2002),
       ! pages 386 and 389 and Figure 8.3.1
@@ -185,9 +297,9 @@
 ! way 1:
 !          do i=1,NGLLX
 !              iglob = ibool(i,j,k,ispec)
-!              dummyx_loc(i,j,k) = displ(1,iglob)
-!              dummyy_loc(i,j,k) = displ(2,iglob)
-!              dummyz_loc(i,j,k) = displ(3,iglob)
+!              dummyx_loc(i,j,k) = displ_inner_core(1,iglob)
+!              dummyy_loc(i,j,k) = displ_inner_core(2,iglob)
+!              dummyz_loc(i,j,k) = displ_inner_core(3,iglob)
 !          enddo
 
 ! way 2:
@@ -198,25 +310,25 @@
         iglob4 = ibool(4,j,k,ispec)
         iglob5 = ibool(5,j,k,ispec)
 
-        dummyx_loc(1,j,k) = displ(1,iglob1)
-        dummyy_loc(1,j,k) = displ(2,iglob1)
-        dummyz_loc(1,j,k) = displ(3,iglob1)
+        dummyx_loc(1,j,k) = displ_inner_core(1,iglob1)
+        dummyy_loc(1,j,k) = displ_inner_core(2,iglob1)
+        dummyz_loc(1,j,k) = displ_inner_core(3,iglob1)
 
-        dummyx_loc(2,j,k) = displ(1,iglob2)
-        dummyy_loc(2,j,k) = displ(2,iglob2)
-        dummyz_loc(2,j,k) = displ(3,iglob2)
+        dummyx_loc(2,j,k) = displ_inner_core(1,iglob2)
+        dummyy_loc(2,j,k) = displ_inner_core(2,iglob2)
+        dummyz_loc(2,j,k) = displ_inner_core(3,iglob2)
 
-        dummyx_loc(3,j,k) = displ(1,iglob3)
-        dummyy_loc(3,j,k) = displ(2,iglob3)
-        dummyz_loc(3,j,k) = displ(3,iglob3)
+        dummyx_loc(3,j,k) = displ_inner_core(1,iglob3)
+        dummyy_loc(3,j,k) = displ_inner_core(2,iglob3)
+        dummyz_loc(3,j,k) = displ_inner_core(3,iglob3)
 
-        dummyx_loc(4,j,k) = displ(1,iglob4)
-        dummyy_loc(4,j,k) = displ(2,iglob4)
-        dummyz_loc(4,j,k) = displ(3,iglob4)
+        dummyx_loc(4,j,k) = displ_inner_core(1,iglob4)
+        dummyy_loc(4,j,k) = displ_inner_core(2,iglob4)
+        dummyz_loc(4,j,k) = displ_inner_core(3,iglob4)
 
-        dummyx_loc(5,j,k) = displ(1,iglob5)
-        dummyy_loc(5,j,k) = displ(2,iglob5)
-        dummyz_loc(5,j,k) = displ(3,iglob5)
+        dummyx_loc(5,j,k) = displ_inner_core(1,iglob5)
+        dummyy_loc(5,j,k) = displ_inner_core(2,iglob5)
+        dummyz_loc(5,j,k) = displ_inner_core(3,iglob5)
 
 
         enddo
@@ -539,9 +651,9 @@
               ! distinguish between single and double precision for reals
               if(CUSTOM_REAL == SIZE_REAL) then
                 ! get displacement and multiply by density to compute G tensor
-                sx_l = rho * dble(displ(1,iglob1))
-                sy_l = rho * dble(displ(2,iglob1))
-                sz_l = rho * dble(displ(3,iglob1))
+                sx_l = rho * dble(displ_inner_core(1,iglob1))
+                sy_l = rho * dble(displ_inner_core(2,iglob1))
+                sz_l = rho * dble(displ_inner_core(3,iglob1))
 
                 ! compute G tensor from s . g and add to sigma (not symmetric)
                 sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
@@ -566,9 +678,9 @@
               else
 
                 ! get displacement and multiply by density to compute G tensor
-                sx_l = rho * displ(1,iglob1)
-                sy_l = rho * displ(2,iglob1)
-                sz_l = rho * displ(3,iglob1)
+                sx_l = rho * displ_inner_core(1,iglob1)
+                sy_l = rho * displ_inner_core(2,iglob1)
+                sz_l = rho * displ_inner_core(3,iglob1)
 
                 ! compute G tensor from s . g and add to sigma (not symmetric)
                 sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
@@ -705,15 +817,15 @@
 ! way 1:
 !          do i=1,NGLLX
 !            iglob = ibool(i,j,k,ispec)
-!            accel(:,iglob) = accel(:,iglob) + sum_terms(:,i,j,k)
+!            accel_inner_core(:,iglob) = accel_inner_core(:,iglob) + sum_terms(:,i,j,k)
 !          enddo
 
 ! way 2:
-          accel(:,ibool(1,j,k,ispec)) = accel(:,ibool(1,j,k,ispec)) + sum_terms(:,1,j,k)
-          accel(:,ibool(2,j,k,ispec)) = accel(:,ibool(2,j,k,ispec)) + sum_terms(:,2,j,k)
-          accel(:,ibool(3,j,k,ispec)) = accel(:,ibool(3,j,k,ispec)) + sum_terms(:,3,j,k)
-          accel(:,ibool(4,j,k,ispec)) = accel(:,ibool(4,j,k,ispec)) + sum_terms(:,4,j,k)
-          accel(:,ibool(5,j,k,ispec)) = accel(:,ibool(5,j,k,ispec)) + sum_terms(:,5,j,k)
+          accel_inner_core(:,ibool(1,j,k,ispec)) = accel_inner_core(:,ibool(1,j,k,ispec)) + sum_terms(:,1,j,k)
+          accel_inner_core(:,ibool(2,j,k,ispec)) = accel_inner_core(:,ibool(2,j,k,ispec)) + sum_terms(:,2,j,k)
+          accel_inner_core(:,ibool(3,j,k,ispec)) = accel_inner_core(:,ibool(3,j,k,ispec)) + sum_terms(:,3,j,k)
+          accel_inner_core(:,ibool(4,j,k,ispec)) = accel_inner_core(:,ibool(4,j,k,ispec)) + sum_terms(:,4,j,k)
+          accel_inner_core(:,ibool(5,j,k,ispec)) = accel_inner_core(:,ibool(5,j,k,ispec)) + sum_terms(:,5,j,k)
 
         enddo
       enddo
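
Finally, every displ -> displ_crust_mantle / displ_inner_core renaming above flows through the same precision-dispatch idiom for the gravity G-tensor term: the products are accumulated in double precision and converted back with sngl() only when CUSTOM_REAL is single precision. A compilable miniature of that idiom (not from the commit; the numeric values are arbitrary and only sigma_xx is shown):

  program precision_dispatch_demo

  implicit none

  integer, parameter :: SIZE_REAL = 4, SIZE_DOUBLE = 8
! flip this to SIZE_DOUBLE to exercise the other branch
  integer, parameter :: CUSTOM_REAL = SIZE_REAL

  real(kind=CUSTOM_REAL) :: displ(3), sigma_xx
  double precision :: rho, sx_l, sy_l, sz_l, gyl, gzl

  rho = 5500.d0
  gyl = 1.d-3
  gzl = 2.d-3
  displ = (/ 1.e-6_CUSTOM_REAL, 2.e-6_CUSTOM_REAL, 3.e-6_CUSTOM_REAL /)
  sigma_xx = 0._CUSTOM_REAL

! distinguish between single and double precision for reals
  if(CUSTOM_REAL == SIZE_REAL) then
! get displacement and multiply by density to compute the G tensor,
! accumulating in double precision and converting back with sngl()
    sx_l = rho * dble(displ(1))
    sy_l = rho * dble(displ(2))
    sz_l = rho * dble(displ(3))
    sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
  else
! in double precision no conversion is needed
    sx_l = rho * displ(1)
    sy_l = rho * displ(2)
    sz_l = rho * displ(3)
    sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
  endif

! sx_l would enter the other stress components, which are not shown here
  print *,'sigma_xx = ',sigma_xx

  end program precision_dispatch_demo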

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_outer_core_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_outer_core_Dev.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/compute_forces_outer_core_Dev.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -46,6 +46,8 @@
                             wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
                             ibool,MOVIE_VOLUME)
 
+! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
+
   implicit none
 
   include "constants.h"

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/specfem3D.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/specfem3D.f90	2011-02-22 01:39:25 UTC (rev 17931)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/specfem3D.f90	2011-02-22 01:39:52 UTC (rev 17932)
@@ -511,7 +511,9 @@
   real :: percentage_edge
 
 ! assembling phase number for non blocking MPI
-  integer :: iphase,icall
+! iphase is for the crust_mantle, outer_core and inner_core regions
+! iphase_CC is for the central cube
+  integer :: iphase,iphase_CC,icall
 
 ! -------- arrays specific to each region here -----------
 
@@ -2128,8 +2130,9 @@
       time = (dble(it-1)*DT-t0)*scale_t_inv
     endif
 
-  iphase = 0 ! do not start any non blocking communications at this stage
-  icall = 1  ! compute all the outer elements first in the case of non blocking MPI
+    iphase = 0 ! do not start any non blocking communications at this stage
+    icall = 1  ! compute all the outer elements first in the case of non blocking MPI
+
     if( USE_DEVILLE_PRODUCTS_VAL ) then
       ! uses Deville et al. (2002) routine
       call compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
@@ -2307,7 +2310,8 @@
             NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE), &
             NGLOB2DMAX_XY,NCHUNKS_VAL,iphase)
 
-    icall = 2 ! compute all the inner elements in the case of non blocking MPI
+    icall = 2 ! now compute all the inner elements in the case of non blocking MPI
+
     if( USE_DEVILLE_PRODUCTS_VAL ) then
         ! uses Deville et al. (2002) routine
       call compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
@@ -2462,6 +2466,10 @@
 
     ! for anisotropy and gravity, x y and z contain r theta and phi
 
+    iphase = 0 ! do not start any non blocking communications at this stage
+    iphase_CC = 0 ! do not start any non blocking communications at this stage
+    icall = 1  ! compute all the outer elements first in the case of non blocking MPI
+
     if( USE_DEVILLE_PRODUCTS_VAL ) then
       call compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
           displ_crust_mantle,accel_crust_mantle, &
@@ -2469,6 +2477,24 @@
           xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
           etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
           gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_xxT, &
           hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -2494,6 +2520,24 @@
           xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
           etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
           gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -2523,6 +2567,24 @@
           xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
           etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
           gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_xxT, &
           hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -2548,6 +2610,24 @@
           xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
           etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
           gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
@@ -2570,32 +2650,6 @@
       endif
     endif
 
-    ! Stacey
-    if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) then
-      call compute_stacey_crust_mantle(ichunk,SIMULATION_TYPE, &
-                              NSTEP,it,SAVE_FORWARD,ibool_crust_mantle, &
-                              veloc_crust_mantle,accel_crust_mantle,b_accel_crust_mantle, &
-                              jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle, &
-                              jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle, &
-                              wgllwgll_xz,wgllwgll_yz, &
-                              normal_xmin_crust_mantle,normal_xmax_crust_mantle, &
-                              normal_ymin_crust_mantle,normal_ymax_crust_mantle, &
-                              rho_vp_crust_mantle,rho_vs_crust_mantle, &
-                              ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle, &
-                              ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle, &
-                              nimin_crust_mantle,nimax_crust_mantle, &
-                              njmin_crust_mantle,njmax_crust_mantle, &
-                              nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
-                              nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-                              nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-                              reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
-                              reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
-                              nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm, &
-                              absorb_xmin_crust_mantle5,absorb_xmax_crust_mantle5, &
-                              absorb_ymin_crust_mantle5,absorb_ymax_crust_mantle5)
-    endif ! Stacey conditions
-
-
     ! Deville routine
     if( USE_DEVILLE_PRODUCTS_VAL ) then
       call compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
@@ -2604,6 +2658,24 @@
           xix_inner_core,xiy_inner_core,xiz_inner_core, &
           etax_inner_core,etay_inner_core,etaz_inner_core, &
           gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
           kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
@@ -2622,6 +2694,24 @@
           xix_inner_core,xiy_inner_core,xiz_inner_core, &
           etax_inner_core,etay_inner_core,etaz_inner_core, &
           gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
           kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
@@ -2643,6 +2733,24 @@
           xix_inner_core,xiy_inner_core,xiz_inner_core, &
           etax_inner_core,etay_inner_core,etaz_inner_core, &
           gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
           kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
@@ -2661,6 +2769,24 @@
           xix_inner_core,xiy_inner_core,xiz_inner_core, &
           etax_inner_core,etay_inner_core,etaz_inner_core, &
           gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
           hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
           kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
@@ -2675,6 +2801,31 @@
       endif
     endif
 
+    ! Stacey
+    if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) then
+      call compute_stacey_crust_mantle(ichunk,SIMULATION_TYPE, &
+                              NSTEP,it,SAVE_FORWARD,ibool_crust_mantle, &
+                              veloc_crust_mantle,accel_crust_mantle,b_accel_crust_mantle, &
+                              jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle, &
+                              jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle, &
+                              wgllwgll_xz,wgllwgll_yz, &
+                              normal_xmin_crust_mantle,normal_xmax_crust_mantle, &
+                              normal_ymin_crust_mantle,normal_ymax_crust_mantle, &
+                              rho_vp_crust_mantle,rho_vs_crust_mantle, &
+                              ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle, &
+                              ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle, &
+                              nimin_crust_mantle,nimax_crust_mantle, &
+                              njmin_crust_mantle,njmax_crust_mantle, &
+                              nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
+                              nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
+                              nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
+                              reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
+                              reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
+                              nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm, &
+                              absorb_xmin_crust_mantle5,absorb_xmax_crust_mantle5, &
+                              absorb_ymin_crust_mantle5,absorb_ymax_crust_mantle5)
+    endif ! Stacey conditions
+
     ! add the sources
     if (SIMULATION_TYPE == 1) &
       call compute_add_sources(myrank,NSOURCES, &
@@ -2776,9 +2927,224 @@
 
     ! assemble all the contributions between slices using MPI
 
+! assemble all the contributions between slices using MPI
+! crust/mantle and inner core handled in the same call
+! in order to reduce the number of MPI messages by 2
+  if(USE_NONBLOCKING_COMMS) then
+
+    iphase = 1 ! initialize the non blocking communication counter
+    iphase_CC = 1 ! initialize the non blocking communication counter for the central cube
+
+! start the non blocking communications
+    call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
+            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS_VAL,iphase)
+
+    icall = 2 ! now compute all the inner elements in the case of non blocking MPI
+
+    ! compute internal forces in the solid regions
+
+    ! for anisotropy and gravity, x y and z contain r theta and phi
+
+    if( USE_DEVILLE_PRODUCTS_VAL ) then
+      call compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
+          displ_crust_mantle,accel_crust_mantle, &
+          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
+          hprime_xx,hprime_xxT, &
+          hprimewgll_xx,hprimewgll_xxT, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
+          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
+          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
+          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
+          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
+          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
+          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+          ibool_crust_mantle,idoubling_crust_mantle, &
+          R_memory_crust_mantle,epsilondev_crust_mantle, &
+          eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
+          alphaval,betaval,gammaval,factor_common_crust_mantle, &
+          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
+          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
+    else
+      call compute_forces_crust_mantle(minus_gravity_table,density_table,minus_deriv_gravity_table, &
+          displ_crust_mantle,accel_crust_mantle, &
+          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+!----------------------
+            is_on_a_slice_edge_crust_mantle,icall, &
+            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
+          hprime_xx,hprime_yy,hprime_zz, &
+          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
+          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
+          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
+          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
+          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
+          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
+          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+          ibool_crust_mantle,idoubling_crust_mantle, &
+          R_memory_crust_mantle,epsilondev_crust_mantle, &
+          eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
+          alphaval,betaval,gammaval,factor_common_crust_mantle, &
+          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
+          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
+    endif
+
+    ! Deville routine
+    if( USE_DEVILLE_PRODUCTS_VAL ) then
+      call compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
+          displ_inner_core,accel_inner_core, &
+          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+          xix_inner_core,xiy_inner_core,xiz_inner_core, &
+          etax_inner_core,etay_inner_core,etaz_inner_core, &
+          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
+          hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
+          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
+          c13store_inner_core,c44store_inner_core, &
+          R_memory_inner_core,epsilondev_inner_core, eps_trace_over_3_inner_core,&
+          one_minus_sum_beta_inner_core, &
+          alphaval,betaval,gammaval, &
+          factor_common_inner_core, &
+          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
+          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
+    else
+      call compute_forces_inner_core(minus_gravity_table,density_table,minus_deriv_gravity_table, &
+          displ_inner_core,accel_inner_core, &
+          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+          xix_inner_core,xiy_inner_core,xiz_inner_core, &
+          etax_inner_core,etay_inner_core,etaz_inner_core, &
+          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+!----------------------
+            is_on_a_slice_edge_inner_core,icall, &
+            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
+            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
+!----------------------
+          hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
+          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
+          c13store_inner_core,c44store_inner_core, &
+          R_memory_inner_core,epsilondev_inner_core, eps_trace_over_3_inner_core,&
+          one_minus_sum_beta_inner_core, &
+          alphaval,betaval,gammaval, &
+          factor_common_inner_core, &
+          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
+          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
+    endif
+
+! assemble all the contributions between slices using MPI
+! crust/mantle and inner core handled in the same call
+! in order to reduce the number of MPI messages by 2
+    do while (iphase <= 7) ! make sure the last communications are finished and processed
+      call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
+            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS_VAL,iphase)
+    enddo
+  else
     ! crust/mantle and inner core handled in the same call
     ! in order to reduce the number of MPI messages by 2
-    call assemble_MPI_vector(myrank, &
+    call assemble_MPI_vector_block(myrank, &
             accel_crust_mantle,NGLOB_CRUST_MANTLE, &
             accel_inner_core,NGLOB_INNER_CORE, &
             iproc_xi,iproc_eta,ichunk,addressing, &
@@ -2801,20 +3167,27 @@
             NGLOB1D_RADIAL(IREGION_INNER_CORE), &
             NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE), &
             NGLOB2DMAX_XY,NCHUNKS_VAL)
+  endif
 
     !---
     !---  use buffers to assemble forces with the central cube
     !---
 
-    if(INCLUDE_CENTRAL_CUBE) then
-      call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-                      npoin2D_cube_from_slices, buffer_all_cube_from_slices, &
-                      buffer_slices, buffer_slices2, ibool_central_cube, &
-                      receiver_cube_from_slices, ibool_inner_core, &
-                      idoubling_inner_core, NSPEC_INNER_CORE, &
-                      ibelm_bottom_inner_core, NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
-                      NGLOB_INNER_CORE,accel_inner_core,NDIM)
-    endif   ! end of assembling forces with the central cube
+  if(INCLUDE_CENTRAL_CUBE) then
+    if(USE_NONBLOCKING_COMMS) then
+      do while (iphase_CC <= 4) ! make sure the last communications are finished and processed
+        call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+          npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+          receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+          ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),accel_inner_core,NDIM,iphase_CC)
+      enddo
+    else
+      call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+        npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,buffer_slices2,ibool_central_cube, &
+        receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core,NSPEC_INNER_CORE, &
+        ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),NGLOB_INNER_CORE,accel_inner_core,NDIM)
+    endif
+  endif   ! end of assembling forces with the central cube
 
 ! way 1:
 !    do i=1,NGLOB_CRUST_MANTLE
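
Purely as a control-flow illustration of the completion logic just above (no real MPI here; in the actual code each loop iteration is one call to assemble_MPI_vector or assemble_MPI_central_cube, each of which advances at most one phase):

  program drain_sketch
  implicit none
  integer :: iphase,iphase_CC
  iphase = 1
  iphase_CC = 1
  do while (iphase <= 7)       ! seven phases for the slice assembly
    iphase = iphase + 1        ! one assemble_MPI_vector call would go here
  enddo
  do while (iphase_CC <= 4)    ! four phases for the central cube assembly
    iphase_CC = iphase_CC + 1  ! one assemble_MPI_central_cube call would go here
  enddo
  end program drain_sketch

The two counters are kept separate because, as noted in the declarations above, iphase drives the crust_mantle, outer_core and inner_core exchanges while iphase_CC drives the central cube exchange.
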
@@ -2865,7 +3238,7 @@
 
       ! crust/mantle and inner core handled in the same call
       ! in order to reduce the number of MPI messages by 2
-      call assemble_MPI_vector(myrank, &
+      call assemble_MPI_vector_block(myrank, &
             b_accel_crust_mantle,NGLOB_CRUST_MANTLE, &
             b_accel_inner_core,NGLOB_INNER_CORE, &
             iproc_xi,iproc_eta,ichunk,addressing, &
@@ -2895,7 +3268,8 @@
 
       if(INCLUDE_CENTRAL_CUBE) then
 
-        call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
+!! DK DK 33333333333333333333333 this should be converted to non blocking (and thus should have iphase etc)
+        call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
                          npoin2D_cube_from_slices, buffer_all_cube_from_slices, &
                          buffer_slices, buffer_slices2, ibool_central_cube, &
                          receiver_cube_from_slices, ibool_inner_core, &


