[cig-commits] r12916 - in seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta: . setup src

dkomati1 at geodynamics.org
Fri Sep 19 03:17:21 PDT 2008


Author: dkomati1
Date: 2008-09-19 03:17:21 -0700 (Fri, 19 Sep 2008)
New Revision: 12916

Added:
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_central_cube.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_slices.f90
Removed:
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90
Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube_block.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_central_cube_buffers.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/extract_all_seismos_from_large_file.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90
Log:
routines to assemble faces between chunks and to assemble with the central cube converted to non-blocking MPI;
all calls to MPI_ISEND() converted to MPI_ISSEND(), as suggested by www.idris.fr
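
The change described above follows a standard pattern: each blocking MPI_SEND / MPI_RECV pair becomes an MPI_ISSEND / MPI_IRECV pair whose requests are later polled with MPI_TEST, so the caller can go back to element computations while the messages are in flight (MPI_ISSEND is the synchronous variant, which does not rely on MPI's internal buffering). The following minimal, self-contained Fortran sketch is written for this message, not taken from the commit; the rank pairing and buffer names are illustrative only, and the usual mpif.h bindings are assumed:

  program nonblocking_sketch
  implicit none
  include 'mpif.h'
  integer, parameter :: N = 1000
  double precision :: sendbuf(N), recvbuf(N)
  integer :: myrank, nbproc, partner, ier
  integer :: request_send, request_receive
  integer :: msg_status(MPI_STATUS_SIZE)
  logical :: flag_send, flag_recv

  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ier)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nbproc, ier)
  partner = ieor(myrank, 1)   ! pair ranks 0<->1, 2<->3, ...
  sendbuf(:) = dble(myrank)

  if (partner < nbproc) then
! post the receive first, then the synchronous non-blocking send
    call MPI_IRECV(recvbuf, N, MPI_DOUBLE_PRECISION, partner, &
                   100, MPI_COMM_WORLD, request_receive, ier)
    call MPI_ISSEND(sendbuf, N, MPI_DOUBLE_PRECISION, partner, &
                    100, MPI_COMM_WORLD, request_send, ier)

! poll with MPI_TEST; the real solver returns to do more element
! computations between polls instead of spinning in a tight loop
    flag_send = .false.
    flag_recv = .false.
    do while (.not. (flag_send .and. flag_recv))
      if (.not. flag_send) call MPI_TEST(request_send, flag_send, msg_status, ier)
      if (.not. flag_recv) call MPI_TEST(request_receive, flag_recv, msg_status, ier)
    enddo
  endif

  call MPI_FINALIZE(ier)
  end program nonblocking_sketch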


Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/Makefile	2008-09-19 10:17:21 UTC (rev 12916)
@@ -122,7 +122,8 @@
 	$O/calendar.o \
 	$O/create_name_database.o \
 	$O/debug_with_opendx.o \
-	$O/fix_non_blocking_arrays.o \
+	$O/fix_non_blocking_slices.o \
+	$O/fix_non_blocking_central_cube.o \
 	$O/write_AVS_DX_surface_data.o \
 	$O/write_AVS_DX_global_chunks_data.o \
 	$O/write_AVS_DX_global_faces_data.o \
@@ -361,9 +362,12 @@
 $O/debug_with_opendx.o: $(SPECINC)/constants.h $S/debug_with_opendx.f90
 	${FCCOMPILE_CHECK} -c -o $O/debug_with_opendx.o ${FCFLAGS_f90} $S/debug_with_opendx.f90
 
-$O/fix_non_blocking_arrays.o: $(SPECINC)/constants.h $S/fix_non_blocking_arrays.f90
-	${FCCOMPILE_CHECK} -c -o $O/fix_non_blocking_arrays.o ${FCFLAGS_f90} $S/fix_non_blocking_arrays.f90
+$O/fix_non_blocking_slices.o: $(SPECINC)/constants.h $S/fix_non_blocking_slices.f90
+	${FCCOMPILE_CHECK} -c -o $O/fix_non_blocking_slices.o ${FCFLAGS_f90} $S/fix_non_blocking_slices.f90
 
+$O/fix_non_blocking_central_cube.o: $(SPECINC)/constants.h $S/fix_non_blocking_central_cube.f90
+	${FCCOMPILE_CHECK} -c -o $O/fix_non_blocking_central_cube.o ${FCFLAGS_f90} $S/fix_non_blocking_central_cube.f90
+
 $O/write_AVS_DX_surface_data.o: $(SPECINC)/constants.h $S/write_AVS_DX_surface_data.f90
 	${FCCOMPILE_CHECK} -c -o $O/write_AVS_DX_surface_data.o ${FCFLAGS_f90} $S/write_AVS_DX_surface_data.f90
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/setup/constants.h	2008-09-19 10:17:21 UTC (rev 12916)
@@ -33,7 +33,7 @@
 
 ! this for non blocking assembly
   logical, parameter :: USE_NONBLOCKING_COMMS = .true.
-  integer, parameter :: ELEMENTS_BETWEEN_NONBLOCKING = 200
+  integer, parameter :: ELEMENTS_BETWEEN_NONBLOCKING = 3000
 
   logical, parameter :: DEBUG_NONBLOCKING_COMMS = .false.
   logical, parameter :: DEBUG_USING_OPENDX = .false.
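
ELEMENTS_BETWEEN_NONBLOCKING presumably sets how many elements are processed between successive polls of the pending non-blocking requests, so raising it from 200 to 3000 means the requests are tested less often, trading polling overhead against how quickly completed messages are noticed. A hypothetical sketch of how such a constant is consumed (the loop body and counters are illustrative only, not code from this commit):

  program elements_between_nonblocking_sketch
  implicit none
  integer, parameter :: ELEMENTS_BETWEEN_NONBLOCKING = 3000
  integer, parameter :: nspec = 20000
  integer :: ispec, npolls
  double precision :: local_work

  npolls = 0
  local_work = 0.d0
  do ispec = 1, nspec
    local_work = local_work + dble(ispec)   ! stands in for one element's computation
! every ELEMENTS_BETWEEN_NONBLOCKING elements, the real code would call the
! phase-based assembly routines, which MPI_TEST their pending requests and
! advance iphase / iphase_CC once the messages have completed
    if (mod(ispec, ELEMENTS_BETWEEN_NONBLOCKING) == 0) npolls = npolls + 1
  enddo
  print *, 'would have polled pending communications ', npolls, ' times; work = ', local_work
  end program elements_between_nonblocking_sketch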

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -25,10 +25,10 @@
 !
 !=====================================================================
 
-subroutine assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-  npoin2D_cube_from_slices, buffer_all_cube_from_slices, buffer_slices, buffer_slices2, ibool_central_cube, &
-  receiver_cube_from_slices, ibool_inner_core, idoubling_inner_core, NSPEC_INNER_CORE, &
-  ibelm_bottom_inner_core, NSPEC2D_BOTTOM_INNER_CORE,NGLOB_INNER_CORE,vector_assemble,ndim_assemble)
+  subroutine assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+    npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+    receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+    ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,vector_assemble,ndim_assemble,iphase_CC)
 
   implicit none
 
@@ -38,16 +38,19 @@
 #endif
   include 'constants.h'
 
+! include values created by the mesher
+  include "values_from_mesher.h"
+
 ! for matching with central cube in inner core
-  integer ichunk, nb_msgs_theor_in_cube, npoin2D_cube_from_slices
+  integer ichunk, nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
   integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
-  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices,buffer_slices2
-  double precision, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM) :: buffer_all_cube_from_slices
+  double precision, dimension(npoin2D_cube_from_slices,ndim_assemble) :: buffer_slices
+  double precision, dimension(npoin2D_cube_from_slices,ndim_assemble,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
   integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
   integer receiver_cube_from_slices
 
 ! local to global mapping
-  integer NSPEC_INNER_CORE,NSPEC2D_BOTTOM_INNER_CORE, NGLOB_INNER_CORE
+  integer NSPEC2D_BOTTOM_INNER_CORE
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
   integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
   integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
@@ -64,6 +67,11 @@
 
 ! MPI status of messages to be received
 #ifdef USE_MPI
+  integer, save :: request_send,request_receive
+! maximum value of nb_msgs_theor_in_cube is 5 (when NPROC_XI == 1)
+! therefore NPROC_XI+4 is always large enough
+  integer, dimension(NPROC_XI_VAL+4), save :: request_send_array,request_receive_array
+  logical :: flag_result_test
   integer, dimension(MPI_STATUS_SIZE) :: msg_status
   integer :: ier
 #endif
@@ -72,60 +80,70 @@
   logical, dimension(NGLOB_INNER_CORE) :: mask
 
 !---
-!---  now use buffers to assemble mass matrix with central cube once and for all
+!---  use buffers to assemble mass matrix with central cube once and for all
 !---
 
+  if(iphase_CC == 1) then
+
 ! on chunks AB and AB_ANTIPODE, receive all the messages from slices
   if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-
     do imsg = 1,nb_msgs_theor_in_cube-1
-
 ! receive buffers from slices
-    sender = sender_from_slices_to_cube(imsg)
+      sender = sender_from_slices_to_cube(imsg)
 #ifdef USE_MPI
-    call MPI_RECV(buffer_slices, &
+      call MPI_IRECV(buffer_all_cube_from_slices(:,:,imsg), &
                 ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
-                itag,MPI_COMM_WORLD,msg_status,ier)
+                itag,MPI_COMM_WORLD,request_receive_array(imsg),ier)
 #endif
-
-! copy buffer in 2D array for each slice
-    buffer_all_cube_from_slices(imsg,:,1:ndim_assemble) = buffer_slices(:,1:ndim_assemble)
-
     enddo
   endif
 
 ! send info to central cube from all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
   if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-
 ! for bottom elements in contact with central cube from the slices side
     ipoin = 0
     do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
-
       ispec = ibelm_bottom_inner_core(ispec2D)
-
 ! only for DOFs exactly on surface of central cube (bottom of these elements)
       k = 1
       do j = 1,NGLLY
         do i = 1,NGLLX
           ipoin = ipoin + 1
-          buffer_slices(ipoin,1:ndim_assemble) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
+          buffer_slices(ipoin,:) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
         enddo
       enddo
     enddo
-
 ! send buffer to central cube
     receiver = receiver_cube_from_slices
 #ifdef USE_MPI
-    call MPI_SEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
-              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,ier)
+    call MPI_ISSEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
+              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send,ier)
 #endif
-
  endif  ! end sending info to central cube
 
+  iphase_CC = iphase_CC + 1
+  return ! exit because we have started some communications therefore we need some time
 
+  endif !!!!!!!!! end of iphase_CC 1
+
+  if(iphase_CC == 2) then
+
+#ifdef USE_MPI
+  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+  endif
+
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+    do imsg = 1,nb_msgs_theor_in_cube-1
+      call MPI_TEST(request_receive_array(imsg),flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not received yet
+    enddo
+  endif
+#endif
+
 ! exchange of their bottom faces between chunks AB and AB_ANTIPODE
   if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-
     ipoin = 0
     do ispec = NSPEC_INNER_CORE, 1, -1
       if (idoubling_inner_core(ispec) == IFLAG_BOTTOM_CENTRAL_CUBE) then
@@ -133,28 +151,45 @@
         do j = 1,NGLLY
           do i = 1,NGLLX
             ipoin = ipoin + 1
-            buffer_slices(ipoin,1:ndim_assemble) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
+            buffer_slices(ipoin,:) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
           enddo
         enddo
       endif
     enddo
-
     sender = sender_from_slices_to_cube(nb_msgs_theor_in_cube)
-
 #ifdef USE_MPI
-    call MPI_SENDRECV(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
-        itag,buffer_slices2,ndim_assemble*npoin2D_cube_from_slices,&
-        MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-#endif
+!   call MPI_SENDRECV(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
+!       itag,buffer_slices2,ndim_assemble*npoin2D_cube_from_slices,&
+!       MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,msg_status,ier)
 
-   buffer_all_cube_from_slices(nb_msgs_theor_in_cube,:,1:ndim_assemble) = buffer_slices2(:,1:ndim_assemble)
+    call MPI_IRECV(buffer_all_cube_from_slices(:,:,nb_msgs_theor_in_cube), &
+        ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,request_receive,ier)
+!! DK DK this merged with previous statement
+!   buffer_all_cube_from_slices(:,:,nb_msgs_theor_in_cube) = buffer_slices2(:,:)
 
+    call MPI_ISSEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
+        itag,MPI_COMM_WORLD,request_send,ier)
+#endif
   endif
 
+  iphase_CC = iphase_CC + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase_CC 2
+
+  if(iphase_CC == 3) then
+
 !--- now we need to assemble the contributions
 
   if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
 
+#ifdef USE_MPI
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+#endif
+
     do idimension = 1,ndim_assemble
 ! erase contributions to central cube array
       array_central_cube(:) = 0._CUSTOM_REAL
@@ -164,9 +199,9 @@
       do imsg = 1,nb_msgs_theor_in_cube-1
         do ipoin = 1,npoin2D_cube_from_slices
           if(CUSTOM_REAL == SIZE_REAL) then
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(imsg,ipoin,idimension))
+            array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(ipoin,idimension,imsg))
           else
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(imsg,ipoin,idimension)
+            array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(ipoin,idimension,imsg)
           endif
         enddo
       enddo
@@ -178,11 +213,11 @@
           if(CUSTOM_REAL == SIZE_REAL) then
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            sngl(buffer_all_cube_from_slices(nb_msgs_theor_in_cube,ipoin,idimension))
+            sngl(buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube))
           else
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            buffer_all_cube_from_slices(nb_msgs_theor_in_cube,ipoin,idimension)
+            buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube)
           endif
           mask(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = .true.
         endif
@@ -206,7 +241,7 @@
 ! copy sum back
       do imsg = 1,nb_msgs_theor_in_cube-1
         do ipoin = 1,npoin2D_cube_from_slices
-          buffer_all_cube_from_slices(imsg,ipoin,idimension) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
+          buffer_all_cube_from_slices(ipoin,idimension,imsg) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
         enddo
       enddo
 
@@ -218,59 +253,94 @@
 
 ! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
   if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-
 ! receive buffers from slices
   sender = receiver_cube_from_slices
 #ifdef USE_MPI
-  call MPI_RECV(buffer_slices, &
+  call MPI_IRECV(buffer_slices, &
               ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+              itag,MPI_COMM_WORLD,request_receive,ier)
 #endif
+! for bottom elements in contact with central cube from the slices side
+!   ipoin = 0
+!   do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
+!     ispec = ibelm_bottom_inner_core(ispec2D)
+! only for DOFs exactly on surface of central cube (bottom of these elements)
+!     k = 1
+!     do j = 1,NGLLY
+!       do i = 1,NGLLX
+!         ipoin = ipoin + 1
+! distinguish between single and double precision for reals
+!         if(CUSTOM_REAL == SIZE_REAL) then
+!           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
+!         else
+!           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
+!         endif
+!       enddo
+!     enddo
+!   enddo
+ endif  ! end receiving info from central cube
 
+!------- send info back from central cube to slices
+
+! on chunk AB & CHUNK_AB_ANTIPODE, send all the messages to slices
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+    do imsg = 1,nb_msgs_theor_in_cube-1
+! send buffers to slices
+      receiver = sender_from_slices_to_cube(imsg)
+#ifdef USE_MPI
+      call MPI_ISSEND(buffer_all_cube_from_slices(:,:,imsg),ndim_assemble*npoin2D_cube_from_slices, &
+              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send_array(imsg),ier)
+#endif
+    enddo
+  endif
+
+  iphase_CC = iphase_CC + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase_CC 3
+
+  if(iphase_CC == 4) then
+
+#ifdef USE_MPI
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+    do imsg = 1,nb_msgs_theor_in_cube-1
+      call MPI_TEST(request_send_array(imsg),flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not sent yet
+    enddo
+  endif
+
+  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+  endif
+#endif
+
+! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
+  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
 ! for bottom elements in contact with central cube from the slices side
     ipoin = 0
     do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
-
       ispec = ibelm_bottom_inner_core(ispec2D)
-
 ! only for DOFs exactly on surface of central cube (bottom of these elements)
       k = 1
       do j = 1,NGLLY
         do i = 1,NGLLX
           ipoin = ipoin + 1
-
 ! distinguish between single and double precision for reals
           if(CUSTOM_REAL == SIZE_REAL) then
-            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,1:ndim_assemble))
+            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
           else
-            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,1:ndim_assemble)
+            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
           endif
-
         enddo
       enddo
     enddo
-
  endif  ! end receiving info from central cube
 
-!------- send info back from central cube to slices
+! this is the exit condition, to go beyond the last phase number
+  iphase_CC = iphase_CC + 1
 
-! on chunk AB & CHUNK_AB_ANTIPODE, send all the messages to slices
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+  endif !!!!!!!!! end of iphase_CC 4
 
-   do imsg = 1,nb_msgs_theor_in_cube-1
+  end subroutine assemble_MPI_central_cube
 
-! copy buffer in 2D array for each slice
-   buffer_slices(:,1:ndim_assemble) = buffer_all_cube_from_slices(imsg,:,1:ndim_assemble)
-
-! send buffers to slices
-    receiver = sender_from_slices_to_cube(imsg)
-#ifdef USE_MPI
-    call MPI_SEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
-              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,ier)
-#endif
-
-   enddo
-   endif
-
-end subroutine assemble_MPI_central_cube
-
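
The routine above no longer completes the assembly in one call: it now takes an iphase_CC argument, posts its communications, and returns early (the repeated "return ! exit because we have started some communications" lines), advancing iphase_CC only once MPI_TEST reports completion. A caller is therefore expected to keep invoking it until iphase_CC has moved past the last phase (4). The simplified, self-contained sketch below illustrates that driver pattern with a stand-in routine, not the real assemble_MPI_central_cube and its argument list:

  program phase_driver_sketch
  implicit none
  integer :: iphase_CC, ncalls

  iphase_CC = 1
  ncalls = 0
  do while (iphase_CC <= 4)
    call advance_central_cube_phase(iphase_CC)
    ncalls = ncalls + 1
! in the real solver, element computations are performed between such calls,
! so communication overlaps useful work instead of busy-waiting here
  enddo
  print *, 'assembly finished after ', ncalls, ' calls'

  contains

  subroutine advance_central_cube_phase(iphase)
  integer, intent(inout) :: iphase
  logical :: flag_result_test
  integer, save :: npolls = 0
! stand-in for MPI_TEST: pretend each message completes on its second poll
  npolls = npolls + 1
  flag_result_test = (mod(npolls, 2) == 0)
  if (.not. flag_result_test) return   ! message not complete yet, try again later
  iphase = iphase + 1                  ! this phase is done, move on to the next
  end subroutine advance_central_cube_phase

  end program phase_driver_sketch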

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube_block.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube_block.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_central_cube_block.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -25,10 +25,10 @@
 !
 !=====================================================================
 
-subroutine assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-  npoin2D_cube_from_slices, buffer_all_cube_from_slices, buffer_slices, buffer_slices2, ibool_central_cube, &
-  receiver_cube_from_slices, ibool_inner_core, idoubling_inner_core, NSPEC_INNER_CORE, &
-  ibelm_bottom_inner_core, NSPEC2D_BOTTOM_INNER_CORE,NGLOB_INNER_CORE,vector_assemble,ndim_assemble)
+  subroutine assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+    npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,buffer_slices2,ibool_central_cube, &
+    receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core,NSPEC_INNER_CORE, &
+    ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,NGLOB_INNER_CORE,vector_assemble,ndim_assemble)
 
   implicit none
 
@@ -42,7 +42,7 @@
   integer ichunk, nb_msgs_theor_in_cube, npoin2D_cube_from_slices
   integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
   double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices,buffer_slices2
-  double precision, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM) :: buffer_all_cube_from_slices
+  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
   integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
   integer receiver_cube_from_slices
 
@@ -89,7 +89,7 @@
 #endif
 
 ! copy buffer in 2D array for each slice
-    buffer_all_cube_from_slices(imsg,:,1:ndim_assemble) = buffer_slices(:,1:ndim_assemble)
+    buffer_all_cube_from_slices(:,1:ndim_assemble,imsg) = buffer_slices(:,1:ndim_assemble)
 
     enddo
   endif
@@ -147,7 +147,7 @@
         MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,msg_status,ier)
 #endif
 
-   buffer_all_cube_from_slices(nb_msgs_theor_in_cube,:,1:ndim_assemble) = buffer_slices2(:,1:ndim_assemble)
+   buffer_all_cube_from_slices(:,1:ndim_assemble,nb_msgs_theor_in_cube) = buffer_slices2(:,1:ndim_assemble)
 
   endif
 
@@ -164,9 +164,9 @@
       do imsg = 1,nb_msgs_theor_in_cube-1
         do ipoin = 1,npoin2D_cube_from_slices
           if(CUSTOM_REAL == SIZE_REAL) then
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(imsg,ipoin,idimension))
+            array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(ipoin,idimension,imsg))
           else
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(imsg,ipoin,idimension)
+            array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(ipoin,idimension,imsg)
           endif
         enddo
       enddo
@@ -178,11 +178,11 @@
           if(CUSTOM_REAL == SIZE_REAL) then
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            sngl(buffer_all_cube_from_slices(nb_msgs_theor_in_cube,ipoin,idimension))
+            sngl(buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube))
           else
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
             array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            buffer_all_cube_from_slices(nb_msgs_theor_in_cube,ipoin,idimension)
+            buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube)
           endif
           mask(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = .true.
         endif
@@ -206,7 +206,7 @@
 ! copy sum back
       do imsg = 1,nb_msgs_theor_in_cube-1
         do ipoin = 1,npoin2D_cube_from_slices
-          buffer_all_cube_from_slices(imsg,ipoin,idimension) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
+          buffer_all_cube_from_slices(ipoin,idimension,imsg) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
         enddo
       enddo
 
@@ -260,7 +260,7 @@
    do imsg = 1,nb_msgs_theor_in_cube-1
 
 ! copy buffer in 2D array for each slice
-   buffer_slices(:,1:ndim_assemble) = buffer_all_cube_from_slices(imsg,:,1:ndim_assemble)
+   buffer_slices(:,1:ndim_assemble) = buffer_all_cube_from_slices(:,1:ndim_assemble,imsg)
 
 ! send buffers to slices
     receiver = sender_from_slices_to_cube(imsg)
@@ -272,5 +272,5 @@
    enddo
    endif
 
-end subroutine assemble_MPI_central_cube_block
+  end subroutine assemble_MPI_central_cube_block
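
Both central-cube routines also swap the index order of buffer_all_cube_from_slices from (imsg, ipoin, idim) to (ipoin, idim, imsg). A plausible reason (my reading, not stated in the commit): with the message index last, the section buffer_all_cube_from_slices(:,:,imsg) is one contiguous block in Fortran's column-major layout, so it can be handed directly to MPI_IRECV / MPI_ISSEND without the compiler having to create a temporary copy, which would be unsafe for non-blocking calls. A tiny illustration of the layout, with made-up sizes:

  program layout_sketch
  implicit none
  integer, parameter :: npoin = 4, ndim = 3, nmsg = 2
  double precision :: buf(npoin, ndim, nmsg)
  integer :: imsg

  do imsg = 1, nmsg
! buf(:,:,imsg) occupies npoin*ndim consecutive memory locations,
! exactly the shape a non-blocking MPI call needs for one message
    buf(:,:,imsg) = dble(imsg)
  enddo
  print *, 'first and last entries: ', buf(1,1,1), buf(npoin,ndim,nmsg)
  end program layout_sketch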
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_scalar.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -34,11 +34,11 @@
             iboolleft_xi,iboolright_xi,iboolleft_eta,iboolright_eta, &
             npoin2D_faces,npoin2D_xi,npoin2D_eta, &
             iboolfaces,iboolcorner, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces_scalar,buffer_received_faces_scalar,npoin2D_max_all, &
+            buffer_send_faces_scalar,buffer_received_faces_scalar,npoin2D_real_size, &
             buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL, &
             NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,NCHUNKS,iphase)
 
@@ -65,7 +65,7 @@
 
   integer NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY
   integer NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL
-  integer NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS
+  integer NUMMSGS_FACES,NCORNERSCHUNKS
 
 ! for addressing of the slices
   integer, dimension(NCHUNKS,0:NPROC_XI-1,0:NPROC_ETA-1) :: addressing
@@ -78,9 +78,9 @@
   integer, dimension(NGLOB1D_RADIAL,NUMCORNERS_SHARED) :: iboolcorner
   integer icount_corners
 
-  integer :: npoin2D_max_all
+  integer :: npoin2D_real_size
   integer, dimension(NGLOB2DMAX_XY,NUMFACES_SHARED) :: iboolfaces
-  real(kind=CUSTOM_REAL), dimension(npoin2D_max_all) :: buffer_send_faces_scalar,buffer_received_faces_scalar
+  real(kind=CUSTOM_REAL), dimension(npoin2D_real_size,NUMFACES_SHARED) :: buffer_send_faces_scalar,buffer_received_faces_scalar
 
 ! buffers for send and receive between corners of the chunks
   real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL) :: buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar
@@ -88,7 +88,7 @@
 ! ---- arrays to assemble between chunks
 
 ! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES) :: iprocfrom_faces,iprocto_faces,imsg_type
+  integer, dimension(NUMMSGS_FACES) :: iprocfrom_faces,iprocto_faces
 
 ! communication pattern for corners between chunks
   integer, dimension(NCORNERSCHUNKS) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
@@ -100,19 +100,23 @@
 
   integer :: ipoin,ipoin2D,ipoin1D
   integer :: sender,receiver
-  integer :: imsg,imsg_loop
+  integer :: imsg
   integer :: icount_faces,npoin2D_chunks
 
 #ifdef USE_MPI
   integer :: ier
   integer, save :: request_send,request_receive
+  integer, dimension(NUMFACES_SHARED), save :: request_send_array,request_receive_array
   logical :: flag_result_test
 #endif
 
 ! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 
 ! check flag to see if we need to assemble (might be turned off when debugging)
-  if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES) return
+  if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES)  then
+    iphase = 9999 ! this means everything is finished
+    return
+  endif
 
 ! here we have to assemble all the contributions between slices using MPI
 
@@ -128,7 +132,7 @@
 
 ! slices copy the right face into the buffer
   do ipoin=1,npoin2D_xi(2)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolright_xi(ipoin))
+    buffer_send_faces_scalar(ipoin,1) = array_val(iboolright_xi(ipoin))
   enddo
 
 ! send messages forward along each row
@@ -160,7 +164,7 @@
   call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
         itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
         itag2,MPI_COMM_WORLD,request_send,ier)
 
 #endif
@@ -185,7 +189,7 @@
   if(iproc_xi > 0) then
   do ipoin=1,npoin2D_xi(1)
     array_val(iboolleft_xi(ipoin)) = array_val(iboolleft_xi(ipoin)) + &
-                              buffer_received_faces_scalar(ipoin)
+                              buffer_received_faces_scalar(ipoin,1)
   enddo
   endif
 
@@ -193,7 +197,7 @@
 ! now we have to send the result back to the sender
 ! all slices copy the left face into the buffer
   do ipoin=1,npoin2D_xi(1)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolleft_xi(ipoin))
+    buffer_send_faces_scalar(ipoin,1) = array_val(iboolleft_xi(ipoin))
   enddo
 
 ! send messages backward along each row
@@ -225,7 +229,7 @@
   call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
         itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
         itag2,MPI_COMM_WORLD,request_send,ier)
 
 #endif
@@ -249,7 +253,7 @@
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_xi < NPROC_XI-1) then
   do ipoin=1,npoin2D_xi(2)
-    array_val(iboolright_xi(ipoin)) = buffer_received_faces_scalar(ipoin)
+    array_val(iboolright_xi(ipoin)) = buffer_received_faces_scalar(ipoin,1)
   enddo
   endif
 
@@ -259,7 +263,7 @@
 
 ! slices copy the right face into the buffer
   do ipoin=1,npoin2D_eta(2)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolright_eta(ipoin))
+    buffer_send_faces_scalar(ipoin,1) = array_val(iboolright_eta(ipoin))
   enddo
 
 ! send messages forward along each row
@@ -291,7 +295,7 @@
   call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
     itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
     itag2,MPI_COMM_WORLD,request_send,ier)
 
 #endif
@@ -316,7 +320,7 @@
   if(iproc_eta > 0) then
   do ipoin=1,npoin2D_eta(1)
     array_val(iboolleft_eta(ipoin)) = array_val(iboolleft_eta(ipoin)) + &
-                              buffer_received_faces_scalar(ipoin)
+                              buffer_received_faces_scalar(ipoin,1)
   enddo
   endif
 
@@ -324,7 +328,7 @@
 ! now we have to send the result back to the sender
 ! all slices copy the left face into the buffer
   do ipoin=1,npoin2D_eta(1)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolleft_eta(ipoin))
+    buffer_send_faces_scalar(ipoin,1) = array_val(iboolleft_eta(ipoin))
   enddo
 
 ! send messages backward along each row
@@ -356,7 +360,7 @@
   call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
     itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
     itag2,MPI_COMM_WORLD,request_send,ier)
 
 #endif
@@ -380,14 +384,10 @@
 ! all slices copy the buffer received to the contributions on the right face
   if(iproc_eta < NPROC_ETA-1) then
   do ipoin=1,npoin2D_eta(2)
-    array_val(iboolright_eta(ipoin)) = buffer_received_faces_scalar(ipoin)
+    array_val(iboolright_eta(ipoin)) = buffer_received_faces_scalar(ipoin,1)
   enddo
   endif
 
-  iphase = iphase + 1
-
-!! DK DK do the rest in blocking for now, for simplicity
-
 !----
 !---- start MPI assembling phase between chunks
 !----
@@ -406,26 +406,21 @@
 !---- put slices in receive mode
 !---- a given slice can belong to at most two faces
 
-! use three step scheme that can never deadlock
-! scheme for faces cannot deadlock even if NPROC_XI = NPROC_ETA = 1
-  do imsg_loop = 1,NUM_MSG_TYPES
-
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
     sender = iprocfrom_faces(imsg)
     npoin2D_chunks = npoin2D_faces(icount_faces)
 #ifdef USE_MPI
-    call MPI_RECV(buffer_received_faces_scalar, &
+    call MPI_IRECV(buffer_received_faces_scalar(:,icount_faces), &
               npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 #endif
-    do ipoin2D=1,npoin2D_chunks
-      array_val(iboolfaces(ipoin2D,icount_faces)) = &
-         array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D)
-    enddo
+!   do ipoin2D=1,npoin2D_chunks
+!     array_val(iboolfaces(ipoin2D,icount_faces)) = &
+!        array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D)
+!   enddo
   endif
   enddo
 
@@ -433,21 +428,58 @@
 !---- a given slice can belong to at most two faces
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
     receiver = iprocto_faces(imsg)
     npoin2D_chunks = npoin2D_faces(icount_faces)
     do ipoin2D=1,npoin2D_chunks
-      buffer_send_faces_scalar(ipoin2D) = array_val(iboolfaces(ipoin2D,icount_faces))
+      buffer_send_faces_scalar(ipoin2D,icount_faces) = array_val(iboolfaces(ipoin2D,icount_faces))
     enddo
 #ifdef USE_MPI
-    call MPI_SEND(buffer_send_faces_scalar,npoin2D_chunks, &
-              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+    call MPI_ISSEND(buffer_send_faces_scalar(:,icount_faces),npoin2D_chunks, &
+              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,request_send_array(icount_faces),ier)
 #endif
   endif
   enddo
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 5
+
+  if(iphase == 6) then
+
+#ifdef USE_MPI
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+  endif
+  enddo
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+  endif
+  enddo
+#endif
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+    do ipoin2D=1,npoin2D_faces(icount_faces)
+      array_val(iboolfaces(ipoin2D,icount_faces)) = &
+         array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D,icount_faces)
+    enddo
+  endif
+  enddo
+
 ! *********************************************************************
 !  transmit messages back in opposite direction (iprocto -> iprocfrom)
 ! *********************************************************************
@@ -457,19 +489,18 @@
 
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
     sender = iprocto_faces(imsg)
     npoin2D_chunks = npoin2D_faces(icount_faces)
 #ifdef USE_MPI
-    call MPI_RECV(buffer_received_faces_scalar, &
+    call MPI_IRECV(buffer_received_faces_scalar(:,icount_faces), &
               npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 #endif
-    do ipoin2D=1,npoin2D_chunks
-      array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D)
-    enddo
+!   do ipoin2D=1,npoin2D_chunks
+!     array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D)
+!   enddo
   endif
   enddo
 
@@ -477,24 +508,62 @@
 !---- a given slice can belong to at most two faces
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
     receiver = iprocfrom_faces(imsg)
     npoin2D_chunks = npoin2D_faces(icount_faces)
     do ipoin2D=1,npoin2D_chunks
-      buffer_send_faces_scalar(ipoin2D) = array_val(iboolfaces(ipoin2D,icount_faces))
+      buffer_send_faces_scalar(ipoin2D,icount_faces) = array_val(iboolfaces(ipoin2D,icount_faces))
     enddo
 #ifdef USE_MPI
-    call MPI_SEND(buffer_send_faces_scalar,npoin2D_chunks, &
-              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+    call MPI_ISSEND(buffer_send_faces_scalar(:,icount_faces),npoin2D_chunks, &
+              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,request_send_array(icount_faces),ier)
 #endif
   endif
   enddo
 
-! end of anti-deadlocking loop
+  iphase = iphase + 1
+  return ! exit because we have started some communications therefore we need some time
+
+  endif !!!!!!!!! end of iphase 6
+
+  if(iphase == 7) then
+
+#ifdef USE_MPI
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+  endif
   enddo
 
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+  endif
+  enddo
+#endif
+
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    do ipoin2D=1,npoin2D_faces(icount_faces)
+      array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D,icount_faces)
+    enddo
+  endif
+  enddo
+
+! this is the exit condition, to go beyond the last phase number
+  iphase = iphase + 1
+
+!! DK DK do the rest in blocking for now, for simplicity
+
 !----
 !---- start MPI assembling corners
 !----
@@ -604,7 +673,7 @@
 
   enddo
 
-  endif !!!!!!!!! end of iphase 5
+  endif !!!!!!!!! end of iphase 7
 
   end subroutine assemble_MPI_scalar
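
In the scalar face assembly above, buffer_send_faces_scalar and buffer_received_faces_scalar gain a NUMFACES_SHARED dimension, and the per-face requests are kept in request_send_array / request_receive_array. That is required once the sends and receives are non-blocking: a buffer handed to MPI_ISSEND or MPI_IRECV must not be reused until its request has completed, so each simultaneously outstanding face message needs its own buffer column. A minimal sketch of that idea (the rank pairing, sizes, and the MPI_WAITALL at the end are illustrative; the real code polls with MPI_TEST between phases instead):

  program per_request_buffers
  implicit none
  include 'mpif.h'
  integer, parameter :: npoin = 500, NUMFACES_SHARED = 4
  double precision :: buffer_send(npoin, NUMFACES_SHARED)
  double precision :: buffer_recv(npoin, NUMFACES_SHARED)
  integer :: request_send_array(NUMFACES_SHARED)
  integer :: request_recv_array(NUMFACES_SHARED)
  integer :: myrank, nbproc, partner, iface, ier

  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ier)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nbproc, ier)
  partner = ieor(myrank, 1)   ! pair ranks 0<->1, 2<->3, ...

  if (partner < nbproc) then
    do iface = 1, NUMFACES_SHARED
      buffer_send(:, iface) = dble(myrank * 10 + iface)
! one buffer column and one request per in-flight message: a buffer given to
! MPI_ISSEND or MPI_IRECV must stay untouched until its request completes
      call MPI_IRECV(buffer_recv(:, iface), npoin, MPI_DOUBLE_PRECISION, &
                     partner, iface, MPI_COMM_WORLD, request_recv_array(iface), ier)
      call MPI_ISSEND(buffer_send(:, iface), npoin, MPI_DOUBLE_PRECISION, &
                      partner, iface, MPI_COMM_WORLD, request_send_array(iface), ier)
    enddo
! the real routines poll these requests with MPI_TEST between phases;
! here we simply wait for all of them to finish
    call MPI_WAITALL(NUMFACES_SHARED, request_recv_array, MPI_STATUSES_IGNORE, ier)
    call MPI_WAITALL(NUMFACES_SHARED, request_send_array, MPI_STATUSES_IGNORE, ier)
  endif

  call MPI_FINALIZE(ier)
  end program per_request_buffers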
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/assemble_MPI_vector.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -39,11 +39,11 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces_vector,buffer_received_faces_vector,npoin2D_max_all, &
+            buffer_send_faces_vector,buffer_received_faces_vector,npoin2D_real_size, &
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL_crust_mantle, &
             NGLOB1D_RADIAL_inner_core,NCHUNKS,iphase)
 
@@ -75,7 +75,7 @@
   integer npoin2D_faces_inner_core(NUMFACES_SHARED)
 
   integer NGLOB1D_RADIAL_crust_mantle,NGLOB1D_RADIAL_inner_core,NPROC_XI,NPROC_ETA
-  integer NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS
+  integer NUMMSGS_FACES,NCORNERSCHUNKS
 
 ! for addressing of the slices
   integer, dimension(NCHUNKS,0:NPROC_XI-1,0:NPROC_ETA-1) :: addressing
@@ -91,10 +91,10 @@
   integer, dimension(NGLOB1D_RADIAL_inner_core,NUMCORNERS_SHARED) :: iboolcorner_inner_core
   integer icount_corners
 
-  integer :: npoin2D_max_all
+  integer :: npoin2D_real_size
   integer, dimension(NGLOB2DMAX_XY_VAL_CM,NUMFACES_SHARED) :: iboolfaces_crust_mantle
   integer, dimension(NGLOB2DMAX_XY_VAL_IC,NUMFACES_SHARED) :: iboolfaces_inner_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all) :: buffer_send_faces_vector,buffer_received_faces_vector
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_real_size,NUMFACES_SHARED) :: buffer_send_faces_vector,buffer_received_faces_vector
 
 ! buffers for send and receive between corners of the chunks
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
@@ -104,7 +104,7 @@
 ! ---- arrays to assemble between chunks
 
 ! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES) :: iprocfrom_faces,iprocto_faces,imsg_type
+  integer, dimension(NUMMSGS_FACES) :: iprocfrom_faces,iprocto_faces
 
 ! communication pattern for corners between chunks
   integer, dimension(NCORNERSCHUNKS) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
@@ -116,7 +116,7 @@
 
   integer :: ipoin,ipoin2D,ipoin1D
   integer :: sender,receiver
-  integer :: imsg,imsg_loop
+  integer :: imsg
   integer :: icount_faces,npoin2D_chunks_all
 
   integer :: npoin2D_xi_all,npoin2D_eta_all,NGLOB1D_RADIAL_all
@@ -126,13 +126,17 @@
 #ifdef USE_MPI
   integer :: ier
   integer, save :: request_send,request_receive
+  integer, dimension(NUMFACES_SHARED), save :: request_send_array,request_receive_array
   logical :: flag_result_test
 #endif
 
 ! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 
 ! check flag to see if we need to assemble (might be turned off when debugging)
-  if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES) return
+  if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES) then
+    iphase = 9999 ! this means everything is finished
+    return
+  endif
 
 ! here we have to assemble all the contributions between slices using MPI
 
@@ -155,15 +159,15 @@
 
 ! slices copy the right face into the buffer
   do ipoin = 1,npoin2D_xi_crust_mantle
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin))
   enddo
 
   do ipoin = 1,npoin2D_xi_inner_core
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolright_xi_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolright_xi_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolright_xi_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolright_xi_inner_core(ipoin))
   enddo
 
 ! send messages forward along each row
@@ -195,7 +199,7 @@
   call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
         itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
         itag2,MPI_COMM_WORLD,request_send,ier)
 
 #endif
@@ -221,20 +225,20 @@
 
   do ipoin = 1,npoin2D_xi_crust_mantle
     accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(1,ipoin)
+                              buffer_received_faces_vector(1,ipoin,1)
     accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(2,ipoin)
+                              buffer_received_faces_vector(2,ipoin,1)
     accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(3,ipoin)
+                              buffer_received_faces_vector(3,ipoin,1)
   enddo
 
   do ipoin = 1,npoin2D_xi_inner_core
     accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(1,ioffset + ipoin)
+                              buffer_received_faces_vector(1,ioffset + ipoin,1)
     accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(2,ioffset + ipoin)
+                              buffer_received_faces_vector(2,ioffset + ipoin,1)
     accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(3,ioffset + ipoin)
+                              buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
@@ -243,15 +247,15 @@
 ! now we have to send the result back to the sender
 ! all slices copy the left face into the buffer
   do ipoin = 1,npoin2D_xi_crust_mantle
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin))
   enddo
 
   do ipoin = 1,npoin2D_xi_inner_core
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin))
   enddo
 
 ! send messages backward along each row
@@ -283,9 +287,8 @@
   call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,sender, &
         itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all,CUSTOM_MPI_TYPE,receiver, &
         itag2,MPI_COMM_WORLD,request_send,ier)
-
 #endif
 
   iphase = iphase + 1
@@ -308,15 +311,15 @@
   if(iproc_xi < NPROC_XI-1) then
 
   do ipoin = 1,npoin2D_xi_crust_mantle
-    accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin)
-    accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin)
-    accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin)
+    accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin,1)
+    accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin,1)
+    accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin,1)
   enddo
 
   do ipoin = 1,npoin2D_xi_inner_core
-    accel_inner_core(1,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin)
-    accel_inner_core(2,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin)
-    accel_inner_core(3,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin)
+    accel_inner_core(1,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin,1)
+    accel_inner_core(2,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin,1)
+    accel_inner_core(3,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
@@ -330,15 +333,15 @@
 
 ! slices copy the right face into the buffer
   do ipoin = 1,npoin2D_eta_crust_mantle
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin))
   enddo
 
   do ipoin = 1,npoin2D_eta_inner_core
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolright_eta_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolright_eta_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolright_eta_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolright_eta_inner_core(ipoin))
   enddo
 
 ! send messages forward along each row
@@ -370,9 +373,8 @@
   call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
     itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
     itag2,MPI_COMM_WORLD,request_send,ier)
-
 #endif
 
   iphase = iphase + 1
@@ -396,20 +398,20 @@
 
   do ipoin = 1,npoin2D_eta_crust_mantle
     accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(1,ipoin)
+                              buffer_received_faces_vector(1,ipoin,1)
     accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(2,ipoin)
+                              buffer_received_faces_vector(2,ipoin,1)
     accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(3,ipoin)
+                              buffer_received_faces_vector(3,ipoin,1)
   enddo
 
   do ipoin = 1,npoin2D_eta_inner_core
     accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(1,ioffset + ipoin)
+                              buffer_received_faces_vector(1,ioffset + ipoin,1)
     accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(2,ioffset + ipoin)
+                              buffer_received_faces_vector(2,ioffset + ipoin,1)
     accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(3,ioffset + ipoin)
+                              buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
@@ -418,15 +420,15 @@
 ! now we have to send the result back to the sender
 ! all slices copy the left face into the buffer
   do ipoin = 1,npoin2D_eta_crust_mantle
-    buffer_send_faces_vector(1,ipoin) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin))
+    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin))
   enddo
 
   do ipoin = 1,npoin2D_eta_inner_core
-    buffer_send_faces_vector(1,ioffset + ipoin) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin))
+    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin))
   enddo
 
 ! send messages backward along each row
@@ -458,9 +460,8 @@
   call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,sender, &
     itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
+  call MPI_ISSEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all,CUSTOM_MPI_TYPE,receiver, &
     itag2,MPI_COMM_WORLD,request_send,ier)
-
 #endif
 
   iphase = iphase + 1
@@ -483,23 +484,19 @@
   if(iproc_eta < NPROC_ETA-1) then
 
   do ipoin = 1,npoin2D_eta_crust_mantle
-    accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin)
-    accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin)
-    accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin)
+    accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin,1)
+    accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin,1)
+    accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin,1)
   enddo
 
   do ipoin = 1,npoin2D_eta_inner_core
-    accel_inner_core(1,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin)
-    accel_inner_core(2,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin)
-    accel_inner_core(3,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin)
+    accel_inner_core(1,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin,1)
+    accel_inner_core(2,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin,1)
+    accel_inner_core(3,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin,1)
   enddo
 
   endif
 
-  iphase = iphase + 1
-
-!! DK DK do the rest in blocking for now, for simplicity
-
 !----
 !---- start MPI assembling phase between chunks
 !----
@@ -518,46 +515,44 @@
 !---- put slices in receive mode
 !---- a given slice can belong to at most two faces
 
-! use three step scheme that can never deadlock
-! scheme for faces cannot deadlock even if NPROC_XI = NPROC_ETA = 1
-  do imsg_loop = 1,NUM_MSG_TYPES
-
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
     sender = iprocfrom_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
-
 #ifdef USE_MPI
-    call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+    call MPI_IRECV(buffer_received_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 #endif
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D)
-      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D)
-      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D)
-    enddo
+!   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+!     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+!        accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D,icount_faces)
+!     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+!        accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D,icount_faces)
+!     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+!        accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D,icount_faces)
+!   enddo
 
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ioffset + ipoin2D)
-      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ioffset + ipoin2D)
-      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ioffset + ipoin2D)
-    enddo
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+!   ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
+!   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+!     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!        accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+!          buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!        accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+!          buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!        accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+!          buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+!   enddo
+
   endif
   enddo
 
@@ -565,37 +560,94 @@
 !---- a given slice can belong to at most two faces
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
     receiver = iprocto_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      buffer_send_faces_vector(1,ipoin2D,icount_faces) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ipoin2D,icount_faces) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ipoin2D,icount_faces) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+    enddo
+
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
     ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      buffer_send_faces_vector(1,ipoin2D) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ipoin2D) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ipoin2D) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-    enddo
-
     do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      buffer_send_faces_vector(1,ioffset + ipoin2D) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ioffset + ipoin2D) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ioffset + ipoin2D) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(1,ioffset + ipoin2D,icount_faces) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ioffset + ipoin2D,icount_faces) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ioffset + ipoin2D,icount_faces) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
     enddo
 
 #ifdef USE_MPI
-    call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+    call MPI_ISSEND(buffer_send_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag, &
+                     MPI_COMM_WORLD,request_send_array(icount_faces),ier)
 #endif
+  endif
+  enddo
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications, which need time to complete
+
+  endif !!!!!!!!! end of iphase 5
+
+  if(iphase == 6) then
+
+#ifdef USE_MPI
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
   endif
   enddo
 
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+  endif
+  enddo
+#endif
 
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D,icount_faces)
+      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D,icount_faces)
+      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+         accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D,icount_faces)
+    enddo
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+           buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+           buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+         accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+           buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+    enddo
+
+  endif
+  enddo
+
 ! *********************************************************************
 !  transmit messages back in opposite direction (iprocto -> iprocfrom)
 ! *********************************************************************
@@ -605,34 +657,36 @@
 
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
     sender = iprocto_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
-
 #ifdef USE_MPI
-    call MPI_RECV(buffer_received_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
+    call MPI_IRECV(buffer_received_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 #endif
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D)
-      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D)
-      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D)
-    enddo
+!   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+!     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D,icount_faces)
+!     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D,icount_faces)
+!     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D,icount_faces)
+!   enddo
 
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ioffset + ipoin2D)
-      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ioffset + ipoin2D)
-      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ioffset + ipoin2D)
-    enddo
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+!   ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
+!   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+!     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!       buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!       buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+!     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+!       buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+!   enddo
+
   endif
   enddo
 
@@ -640,39 +694,91 @@
 !---- a given slice can belong to at most two faces
   icount_faces = 0
   do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
     receiver = iprocfrom_faces(imsg)
 
 ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
     npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      buffer_send_faces_vector(1,ipoin2D,icount_faces) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ipoin2D,icount_faces) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ipoin2D,icount_faces) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+    enddo
+
 ! the buffer for the inner core starts right after the buffer for the crust and mantle
     ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      buffer_send_faces_vector(1,ipoin2D) = accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ipoin2D) = accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ipoin2D) = accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-    enddo
-
     do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      buffer_send_faces_vector(1,ioffset + ipoin2D) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ioffset + ipoin2D) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ioffset + ipoin2D) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(1,ioffset + ipoin2D,icount_faces) = accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(2,ioffset + ipoin2D,icount_faces) = accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
+      buffer_send_faces_vector(3,ioffset + ipoin2D,icount_faces) = accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
     enddo
 
 #ifdef USE_MPI
-    call MPI_SEND(buffer_send_faces_vector,NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+    call MPI_ISSEND(buffer_send_faces_vector(:,:,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag, &
+                     MPI_COMM_WORLD,request_send_array(icount_faces),ier)
 #endif
+  endif
+  enddo
 
+  iphase = iphase + 1
+  return ! exit because we have started some communications, which need time to complete
+
+  endif !!!!!!!!! end of iphase 6
+
+  if(iphase == 7) then
+
+#ifdef USE_MPI
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocto_faces(imsg)) then
+    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
   endif
   enddo
 
-! end of anti-deadlocking loop
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
+  endif
   enddo
+#endif
 
+  icount_faces = 0
+  do imsg = 1,NUMMSGS_FACES
+  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+  if(myrank==iprocfrom_faces(imsg)) then
+    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D,icount_faces)
+      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D,icount_faces)
+      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D,icount_faces)
+    enddo
+
+! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+    enddo
+  endif
+  enddo
+
+! this is the exit condition, to go beyond the last phase number
+  iphase = iphase + 1
+
+!! DK DK do the rest in blocking for now, for simplicity
+
 !----
 !---- start MPI assembling corners
 !----
@@ -858,7 +964,7 @@
 
   enddo
 
-  endif !!!!!!!!! end of iphase 5
+  endif !!!!!!!!! end of iphase 7
 
   end subroutine assemble_MPI_vector
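
The hunks above replace the old blocking three-step face exchange with a phase counter: one phase packs the buffers and posts MPI_IRECV / MPI_ISSEND, then returns immediately; a later phase polls the pending requests with MPI_TEST and only adds the buffered contributions once both messages have completed. A minimal sketch of that pattern follows (illustrative names, not the exact SPECFEM3D_GLOBE variables; the caller is assumed to keep the buffers, the requests and iphase alive between calls):

  subroutine assemble_faces_sketch(field, ibool_face, npoin_face, neighbor, &
                                   buf_send, buf_recv, request_send, request_recv, iphase)
  use mpi
  implicit none
  integer, intent(in) :: npoin_face, neighbor
  integer, intent(in) :: ibool_face(npoin_face)
  double precision, intent(inout) :: field(*), buf_send(npoin_face), buf_recv(npoin_face)
  integer, intent(inout) :: request_send, request_recv, iphase
  integer :: ipoin, ier, msg_status(MPI_STATUS_SIZE)
  logical :: done_send, done_recv

  if(iphase == 1) then
! pack the boundary values, start both communications, then give them time to progress
    do ipoin = 1,npoin_face
      buf_send(ipoin) = field(ibool_face(ipoin))
    enddo
    call MPI_IRECV(buf_recv,npoin_face,MPI_DOUBLE_PRECISION,neighbor,0,MPI_COMM_WORLD,request_recv,ier)
    call MPI_ISSEND(buf_send,npoin_face,MPI_DOUBLE_PRECISION,neighbor,0,MPI_COMM_WORLD,request_send,ier)
    iphase = iphase + 1
    return
  endif

  if(iphase == 2) then
    call MPI_TEST(request_recv,done_recv,msg_status,ier)
    if(.not. done_recv) return ! exit if the message has not been received yet
    call MPI_TEST(request_send,done_send,msg_status,ier)
    if(.not. done_send) return ! exit if the message has not been sent yet
! assemble the contribution of the neighboring slice
    do ipoin = 1,npoin_face
      field(ibool_face(ipoin)) = field(ibool_face(ipoin)) + buf_recv(ipoin)
    enddo
    iphase = iphase + 1 ! go beyond the last phase number to signal completion
  endif

  end subroutine assemble_faces_sketch

In the real routine the same post-then-test pattern is repeated for the xi and eta faces inside each chunk and now also for the faces between chunks, which is why iphase runs up to 7 instead of 5.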
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_CM_IC.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -54,11 +54,13 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
             buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
-            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUM_MSG_TYPES,iphase, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+               nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+               npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+               receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
 #endif
           COMPUTE_AND_STORE_STRAIN,AM_V,icall)
 
@@ -249,7 +251,7 @@
   integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
 
 ! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
 
 ! communication pattern for corners between chunks
   integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
@@ -268,8 +270,19 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
      buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector
 
-! number of message types
-  integer NUM_MSG_TYPES
+! for matching with central cube in inner core
+  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
+  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
+  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
+  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
+  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
+  integer receiver_cube_from_slices
+  logical :: INCLUDE_CENTRAL_CUBE
+
+! local to global mapping
+  integer NSPEC2D_BOTTOM_INNER_CORE
+  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
+
 #endif
 
 ! ****************************************************
@@ -290,8 +303,9 @@
 
 ! process the communications every ELEMENTS_BETWEEN_NONBLOCKING elements
     computed_elements = computed_elements + 1
-    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_BETWEEN_NONBLOCKING) == 0) &
-         call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
+    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_BETWEEN_NONBLOCKING) == 0) then
+
+      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
             npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1), &
@@ -299,13 +313,23 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
             buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUMMSGS_FACES_VAL,NUM_MSG_TYPES,NCORNERSCHUNKS_VAL, &
+            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
             NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
             NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
+
+      if(INCLUDE_CENTRAL_CUBE) then
+          if(iphase > 7 .and. iphase_CC <= 4) &
+            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+              npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+              receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+              ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,accel_inner_core,NDIM,iphase_CC)
+      endif
+
+    endif
 #endif
 
     do k=1,NGLLZ
@@ -805,11 +829,11 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
             buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUMMSGS_FACES_VAL,NUM_MSG_TYPES,NCORNERSCHUNKS_VAL, &
+            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
             NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
             NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
 #endif
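
With these arguments in place, compute_forces_CM_IC advances the communications a little every ELEMENTS_BETWEEN_NONBLOCKING elements of the second call: the face/corner assembly is driven while iphase <= 7, and once it is past its last phase the central-cube assembly is polled through iphase_CC. A self-contained toy version of that control flow (the two advance_* routines stand in for assemble_MPI_vector and assemble_MPI_central_cube and only bump their counters here; the real calls return early when a message is still in flight):

  program overlap_sketch
  implicit none
  integer, parameter :: nspec = 1000, ELEMENTS_BETWEEN_NONBLOCKING = 100
  integer :: ispec, computed_elements, iphase, iphase_CC

  computed_elements = 0
  iphase = 1    ! faces and corners between slices and chunks
  iphase_CC = 1 ! central cube of the inner core

  do ispec = 1,nspec
    computed_elements = computed_elements + 1
    if(mod(computed_elements,ELEMENTS_BETWEEN_NONBLOCKING) == 0) then
      if(iphase <= 7) then
        call advance_faces(iphase)           ! posts or tests MPI requests in the real code
      else if(iphase_CC <= 4) then
        call advance_central_cube(iphase_CC) ! same idea for the central cube of the inner core
      endif
    endif
! ... compute the stiffness contribution of element ispec here ...
  enddo

  contains

  subroutine advance_faces(iphase)
  integer, intent(inout) :: iphase
  iphase = iphase + 1
  end subroutine advance_faces

  subroutine advance_central_cube(iphase_CC)
  integer, intent(inout) :: iphase_CC
  iphase_CC = iphase_CC + 1
  end subroutine advance_central_cube

  end program overlap_sketch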

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/compute_forces_OC.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -37,11 +37,10 @@
           iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
           npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
           iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces,imsg_type, &
+          iprocfrom_faces,iprocto_faces, &
           iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
           buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
-          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
-          NUM_MSG_TYPES,iphase, &
+          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar,iphase, &
 #endif
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
@@ -67,7 +66,7 @@
   integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
 
 ! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
 
 ! communication pattern for corners between chunks
   integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
@@ -82,15 +81,12 @@
 ! of the vector buffer in memory even if it has an additional index here
 ! allocate these automatic arrays in the memory stack to avoid memory fragmentation with "allocate()"
   integer :: npoin2D_max_all
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all) :: buffer_send_faces,buffer_received_faces
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all,NUMFACES_SHARED) :: buffer_send_faces,buffer_received_faces
 
   integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
 
   real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL_OC) :: buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar
 
-! number of message types
-  integer NUM_MSG_TYPES
-
   logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
 
   integer :: iphase
@@ -158,11 +154,11 @@
             iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
             npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
             iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_faces,buffer_received_faces,npoin2D_xi_outer_core(1), &
             buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
-            NUMMSGS_FACES_VAL,NUM_MSG_TYPES,NCORNERSCHUNKS_VAL, &
+            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
             NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_OC, &
             NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC,NGLOB2DMAX_XY_VAL_OC,NCHUNKS_VAL,iphase)
 #endif

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_central_cube_buffers.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_central_cube_buffers.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/create_central_cube_buffers.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -78,7 +78,7 @@
   integer, dimension(non_zero_nb_msgs_theor_in_cube), intent(out) :: sender_from_slices_to_cube
   integer, dimension(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices), intent(out) :: ibool_central_cube
   double precision, dimension(npoin2D_cube_from_slices,NDIM), intent(out) :: buffer_slices,buffer_slices2
-  double precision, dimension(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM), intent(out) :: &
+  double precision, dimension(npoin2D_cube_from_slices,NDIM,non_zero_nb_msgs_theor_in_cube), intent(out) :: &
         buffer_all_cube_from_slices
 
 ! local variables below
@@ -246,7 +246,7 @@
 #endif
 
 ! copy buffer in 2D array for each slice
-    buffer_all_cube_from_slices(imsg,:,:) = buffer_slices(:,:)
+    buffer_all_cube_from_slices(:,:,imsg) = buffer_slices(:,:)
 
     enddo
   endif
@@ -313,7 +313,7 @@
     buffer_slices2 = 0
 #endif
 
-    buffer_all_cube_from_slices(nb_msgs_theor_in_cube,:,:) = buffer_slices2(:,:)
+    buffer_all_cube_from_slices(:,:,nb_msgs_theor_in_cube) = buffer_slices2(:,:)
 
   endif
 
@@ -325,9 +325,9 @@
 
    do ipoin = 1,npoin2D_cube_from_slices
 
-     x_target = buffer_all_cube_from_slices(imsg,ipoin,1)
-     y_target = buffer_all_cube_from_slices(imsg,ipoin,2)
-     z_target = buffer_all_cube_from_slices(imsg,ipoin,3)
+     x_target = buffer_all_cube_from_slices(ipoin,1,imsg)
+     y_target = buffer_all_cube_from_slices(ipoin,2,imsg)
+     z_target = buffer_all_cube_from_slices(ipoin,3,imsg)
 
 ! x = x_min
   do ispec2D = 1,nspec2D_xmin_inner_core
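
The index reordering above (the message number moves from the first to the last dimension of buffer_all_cube_from_slices) matters because Fortran stores arrays in column-major order: with the message index last, buffer_all_cube_from_slices(:,:,imsg) is one contiguous block that can be handed directly to an MPI call, whereas a section with the message index first is strided. Avoiding a compiler-generated temporary for a strided section is presumably the motivation here, since a non-blocking call may still be using the buffer after the call returns. A tiny illustration with hypothetical names (is_contiguous is a Fortran 2008 intrinsic):

  program layout_sketch
  implicit none
  integer, parameter :: npoin = 4, ndim = 3, nmsg = 2
  double precision :: old_layout(nmsg,npoin,ndim) ! message index first, as before this commit
  double precision :: new_layout(npoin,ndim,nmsg) ! message index last, as after this commit
  old_layout = 0.d0
  new_layout = 0.d0
! one message is a single contiguous block only in the new layout
  print *,is_contiguous(old_layout(1,:,:)) ! F : strided section
  print *,is_contiguous(new_layout(:,:,1)) ! T : solid block, safe to pass to MPI
  end program layout_sketch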

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/extract_all_seismos_from_large_file.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/extract_all_seismos_from_large_file.f90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/extract_all_seismos_from_large_file.f90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -37,7 +37,7 @@
 
 ! number of seismogram files stored in the unique large file
   integer, parameter :: N_COMPONENTS = 3
-  integer, parameter :: NREC = 35 * N_COMPONENTS
+  integer, parameter :: NREC = 56 * N_COMPONENTS
 
 ! number of time steps in each seismogram file
   integer, parameter :: NSTEP = 10100

Deleted: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -1,89 +0,0 @@
-!=====================================================================
-!
-!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
-!          --------------------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Seismological Laboratory, California Institute of Technology, USA
-!             and University of Pau / CNRS / INRIA, France
-! (c) California Institute of Technology and University of Pau / CNRS / INRIA
-!                            August 2008
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-!! DK DK fix the non-blocking arrays to assemble inside the chunks: elements
-!! DK DK in contact with the MPI faces by an edge or a corner only but not
-!! DK DK a full face are missing, therefore let us add them
-  subroutine fix_non_blocking_arrays(is_on_a_slice_edge,iboolright_xi, &
-         iboolleft_xi,iboolright_eta,iboolleft_eta, &
-         npoin2D_xi,npoin2D_eta,ibool, &
-         mask_ibool,nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX)
-
-  implicit none
-
-  include "constants.h"
-
-  integer :: npoin2D_xi,npoin2D_eta,nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
-
-  logical, dimension(nspec) :: is_on_a_slice_edge
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX) :: iboolleft_xi,iboolright_xi
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX) :: iboolleft_eta,iboolright_eta
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
-
-! this mask is declared as integer in the calling program because it is used elsewhere
-! to store integers, and it is reused here as a logical to save memory
-  logical, dimension(nglob) :: mask_ibool
-
-  integer :: ipoin,ispec,i,j,k
-
-! clean the mask
-  mask_ibool(:) = .false.
-
-! mark all the points that are in the MPI buffers to assemble inside each chunk
-  do ipoin = 1,npoin2D_xi
-    mask_ibool(iboolright_xi(ipoin)) = .true.
-    mask_ibool(iboolleft_xi(ipoin)) = .true.
-  enddo
-
-  do ipoin = 1,npoin2D_eta
-    mask_ibool(iboolright_eta(ipoin)) = .true.
-    mask_ibool(iboolleft_eta(ipoin)) = .true.
-  enddo
-
-! now label all the elements that have at least one corner belonging
-! to any of these buffers as elements that must contribute to the
-! first step of the calculations (performed on the edges before starting
-! the non-blocking communications); there is no need to examine the inside
-! of the elements, checking their eight corners is sufficient
-  do ispec = 1,nspec
-    do k = 1,NGLLZ,NGLLZ-1
-      do j  = 1,NGLLY,NGLLY-1
-        do i = 1,NGLLX,NGLLX-1
-          if(mask_ibool(ibool(i,j,k,ispec))) then
-            is_on_a_slice_edge(ispec) = .true.
-            goto 888
-          endif
-        enddo
-      enddo
-    enddo
-  888 continue
-  enddo
-
-  end subroutine fix_non_blocking_arrays
-

Added: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_central_cube.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_central_cube.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_central_cube.f90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -0,0 +1,106 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory, California Institute of Technology, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) California Institute of Technology and University of Pau / CNRS / INRIA
+!                            August 2008
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+!! DK DK fix the non-blocking arrays to assemble the central cube: elements
+!! DK DK in contact with the MPI faces by an edge or a corner only but not
+!! DK DK a full face are missing, therefore let us add them
+  subroutine fix_non_blocking_central_cube(is_on_a_slice_edge, &
+         ibool,nspec,nglob,nb_msgs_theor_in_cube,ibelm_bottom_inner_core, &
+         idoubling_inner_core,npoin2D_cube_from_slices,ibool_central_cube,NSPEC2D_BOTTOM_INNER_CORE,ichunk)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: nspec,nglob,nb_msgs_theor_in_cube,NSPEC2D_BOTTOM_INNER_CORE,ichunk,npoin2D_cube_from_slices
+
+  logical, dimension(nspec) :: is_on_a_slice_edge
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices) :: ibool_central_cube
+
+  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
+
+! doubling flag of each spectral element of the inner core
+  integer, dimension(nspec) :: idoubling_inner_core
+
+! local mask used to flag the global points that belong to the MPI buffers of the central cube
+  logical, dimension(nglob) :: mask_ibool
+
+  integer :: ipoin,ispec,i,j,k,imsg,ispec2D
+
+  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+    do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
+      ispec = ibelm_bottom_inner_core(ispec2D)
+      is_on_a_slice_edge(ispec) = .true.
+    enddo
+  endif
+
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+    do ispec = 1,nspec
+      if(idoubling_inner_core(ispec) == IFLAG_BOTTOM_CENTRAL_CUBE .or. &
+         idoubling_inner_core(ispec) == IFLAG_TOP_CENTRAL_CUBE) &
+        is_on_a_slice_edge(ispec) = .true.
+    enddo
+  endif
+
+  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+
+! clean the mask
+  mask_ibool(:) = .false.
+
+    do imsg = 1,nb_msgs_theor_in_cube
+      do ipoin = 1,npoin2D_cube_from_slices
+        mask_ibool(ibool_central_cube(imsg,ipoin)) = .true.
+      enddo
+    enddo
+
+! now label all the elements that have at least one corner belonging
+! to any of these buffers as elements that must contribute to the
+! first step of the calculations (performed on the edges before starting
+! the non-blocking communications); there is no need to examine the inside
+! of the elements, checking their eight corners is sufficient
+  do ispec = 1,nspec
+    do k = 1,NGLLZ,NGLLZ-1
+      do j  = 1,NGLLY,NGLLY-1
+        do i = 1,NGLLX,NGLLX-1
+          if(mask_ibool(ibool(i,j,k,ispec))) then
+            is_on_a_slice_edge(ispec) = .true.
+            goto 888
+          endif
+        enddo
+      enddo
+    enddo
+  888 continue
+  enddo
+
+  endif
+
+  end subroutine fix_non_blocking_central_cube
+

Copied: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_slices.f90 (from rev 12912, seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_arrays.f90)
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_slices.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/fix_non_blocking_slices.f90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -0,0 +1,89 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  4 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Seismological Laboratory, California Institute of Technology, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) California Institute of Technology and University of Pau / CNRS / INRIA
+!                            August 2008
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+!! DK DK fix the non-blocking arrays to assemble inside the chunks: elements
+!! DK DK in contact with the MPI faces by an edge or a corner only but not
+!! DK DK a full face are missing, therefore let us add them
+  subroutine fix_non_blocking_slices(is_on_a_slice_edge,iboolright_xi, &
+         iboolleft_xi,iboolright_eta,iboolleft_eta, &
+         npoin2D_xi,npoin2D_eta,ibool, &
+         mask_ibool,nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: npoin2D_xi,npoin2D_eta,nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
+
+  logical, dimension(nspec) :: is_on_a_slice_edge
+
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX) :: iboolleft_xi,iboolright_xi
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX) :: iboolleft_eta,iboolright_eta
+
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+
+! this mask is declared as integer in the calling program because it is used elsewhere
+! to store integers, and it is reused here as a logical to save memory
+  logical, dimension(nglob) :: mask_ibool
+
+  integer :: ipoin,ispec,i,j,k
+
+! clean the mask
+  mask_ibool(:) = .false.
+
+! mark all the points that are in the MPI buffers to assemble inside each chunk
+  do ipoin = 1,npoin2D_xi
+    mask_ibool(iboolright_xi(ipoin)) = .true.
+    mask_ibool(iboolleft_xi(ipoin)) = .true.
+  enddo
+
+  do ipoin = 1,npoin2D_eta
+    mask_ibool(iboolright_eta(ipoin)) = .true.
+    mask_ibool(iboolleft_eta(ipoin)) = .true.
+  enddo
+
+! now label all the elements that have at least one corner belonging
+! to any of these buffers as elements that must contribute to the
+! first step of the calculations (performed on the edges before starting
+! the non-blocking communications); there is no need to examine the inside
+! of the elements, checking their eight corners is sufficient
+  do ispec = 1,nspec
+    do k = 1,NGLLZ,NGLLZ-1
+      do j  = 1,NGLLY,NGLLY-1
+        do i = 1,NGLLX,NGLLX-1
+          if(mask_ibool(ibool(i,j,k,ispec))) then
+            is_on_a_slice_edge(ispec) = .true.
+            goto 888
+          endif
+        enddo
+      enddo
+    enddo
+  888 continue
+  enddo
+
+  end subroutine fix_non_blocking_slices
+
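
Both fix_non_blocking_* routines above use the same small indexing trick: a DO loop with stride NGLL-1 over 1..NGLL visits only the two endpoints of each direction, so the triple loop scans just the eight corners of every element instead of all its GLL points. A quick stand-alone check, taking 5 GLL points per direction as an example value:

  program corner_stride_sketch
  implicit none
  integer, parameter :: NGLLX = 5, NGLLY = 5, NGLLZ = 5
  integer :: i,j,k,n
  n = 0
  do k = 1,NGLLZ,NGLLZ-1
    do j = 1,NGLLY,NGLLY-1
      do i = 1,NGLLX,NGLLX-1
        n = n + 1
      enddo
    enddo
  enddo
  print *,'visited',n,'points out of',NGLLX*NGLLY*NGLLZ ! prints 8 out of 125
  end program corner_stride_sketch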

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/main_program.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -530,17 +530,17 @@
 !! DK DK in contact with the MPI faces by an edge or a corner only but not
 !! DK DK a full face are missing, therefore let us add them
 #ifdef USE_MPI
-  call fix_non_blocking_arrays(is_on_a_slice_edge_crust_mantle,iboolright_xi_crust_mantle, &
+  call fix_non_blocking_slices(is_on_a_slice_edge_crust_mantle,iboolright_xi_crust_mantle, &
          iboolleft_xi_crust_mantle,iboolright_eta_crust_mantle,iboolleft_eta_crust_mantle, &
          npoin2D_xi_crust_mantle(1),npoin2D_eta_crust_mantle(1),ibool_crust_mantle, &
          mask_ibool,NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB2DMAX_XMIN_XMAX_CM,NGLOB2DMAX_YMIN_YMAX_CM)
 
-  call fix_non_blocking_arrays(is_on_a_slice_edge_outer_core,iboolright_xi_outer_core, &
+  call fix_non_blocking_slices(is_on_a_slice_edge_outer_core,iboolright_xi_outer_core, &
          iboolleft_xi_outer_core,iboolright_eta_outer_core,iboolleft_eta_outer_core, &
          npoin2D_xi_outer_core(1),npoin2D_eta_outer_core(1),ibool_outer_core, &
          mask_ibool,NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC)
 
-  call fix_non_blocking_arrays(is_on_a_slice_edge_inner_core,iboolright_xi_inner_core, &
+  call fix_non_blocking_slices(is_on_a_slice_edge_inner_core,iboolright_xi_inner_core, &
          iboolleft_xi_inner_core,iboolright_eta_inner_core,iboolleft_eta_inner_core, &
          npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1),ibool_inner_core, &
          mask_ibool,NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB2DMAX_XMIN_XMAX_IC,NGLOB2DMAX_YMIN_YMAX_IC)

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90	2008-09-18 21:34:10 UTC (rev 12915)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/version41_beta/src/specfem3D.F90	2008-09-19 10:17:21 UTC (rev 12916)
@@ -197,6 +197,7 @@
 #ifdef USE_MPI
 
   integer :: ipoin
+  integer :: iphase_CC
 
 ! communication pattern for faces between chunks
   integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
@@ -216,7 +217,7 @@
 ! of the vector buffer in memory even if it has an additional index here
 ! allocate these automatic arrays in the memory stack to avoid memory fragmentation with "allocate()"
   integer :: npoin2D_max_all
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all) :: buffer_send_faces,buffer_received_faces
+  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all,NUMFACES_SHARED) :: buffer_send_faces,buffer_received_faces
 
 #endif
 
@@ -1296,7 +1297,7 @@
       call MPI_Abort(MPI_COMM_WORLD,errorcode,ier)
     endif
 
-    allocate(buffer_all_cube_from_slices(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM),STAT=ier)
+    allocate(buffer_all_cube_from_slices(npoin2D_cube_from_slices,NDIM,non_zero_nb_msgs_theor_in_cube),STAT=ier)
     if (ier /= 0 ) then
       print *,"ABORTING can not allocate in specfem3D ier=",ier
       call MPI_Abort(MPI_COMM_WORLD,errorcode,ier)
@@ -1334,16 +1335,21 @@
        receiver_cube_from_slices,sender_from_slices_to_cube,ibool_central_cube, &
        buffer_slices,buffer_slices2,buffer_all_cube_from_slices)
 
+    call fix_non_blocking_central_cube(is_on_a_slice_edge_inner_core, &
+         ibool_inner_core,NSPEC_INNER_CORE,NGLOB_INNER_CORE,nb_msgs_theor_in_cube,ibelm_bottom_inner_core, &
+         idoubling_inner_core,npoin2D_cube_from_slices,ibool_central_cube, &
+         NSPEC2D_BOTTOM(IREGION_INNER_CORE),ichunk)
+
     if(myrank == 0) write(IMAIN,*) 'done including central cube'
 
 ! the mass matrix to assemble is a scalar, not a vector
     ndim_assemble = 1
 
 ! use these buffers to assemble the inner core mass matrix with the central cube
-    call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-     npoin2D_cube_from_slices, buffer_all_cube_from_slices, buffer_slices, buffer_slices2, ibool_central_cube, &
-     receiver_cube_from_slices, ibool_inner_core, idoubling_inner_core, NSPEC_INNER_CORE, &
-     ibelm_bottom_inner_core, NSPEC2D_BOTTOM(IREGION_INNER_CORE),NGLOB_INNER_CORE,rmass_inner_core,ndim_assemble)
+    call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+     npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,buffer_slices2,ibool_central_cube, &
+     receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core,NSPEC_INNER_CORE, &
+     ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),NGLOB_INNER_CORE,rmass_inner_core,ndim_assemble)
 
 ! suppress fictitious mass matrix elements in central cube
 ! because the slices do not compute all their spectral elements in the cube
@@ -1662,6 +1668,18 @@
     is_on_a_slice_edge_inner_core(:) = .true.
   endif
 
+#ifdef USE_MPI
+  if(USE_NONBLOCKING_COMMS) then
+     if(npoin2D_xi_crust_mantle(1) /= npoin2D_eta_crust_mantle(1) .or. &
+        npoin2D_xi_crust_mantle(2) /= npoin2D_eta_crust_mantle(2) .or. &
+        npoin2D_xi_outer_core(1) /= npoin2D_eta_outer_core(1) .or. &
+        npoin2D_xi_outer_core(2) /= npoin2D_eta_outer_core(2) .or. &
+        npoin2D_xi_inner_core(1) /= npoin2D_eta_inner_core(1) .or. &
+        npoin2D_xi_inner_core(2) /= npoin2D_eta_inner_core(2)) &
+       stop 'non-blocking scheme temporarily requires npoin2D_xi = npoin2D_eta because of the size of some buffers reused'
+  endif
+#endif
+
   vx_crust_mantle = size(factor_common_crust_mantle,2)
   vy_crust_mantle = size(factor_common_crust_mantle,3)
   vz_crust_mantle = size(factor_common_crust_mantle,4)
@@ -2008,11 +2026,10 @@
           iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
           npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
           iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces,imsg_type, &
+          iprocfrom_faces,iprocto_faces, &
           iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
           buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
-          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
-          NUM_MSG_TYPES,iphase, &
+          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar,iphase, &
 #endif
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
@@ -2144,11 +2161,11 @@
             iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
             npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
             iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_faces,buffer_received_faces,npoin2D_xi_outer_core(1), &
             buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
             NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE),NGLOB2DMAX_XY_VAL_OC,NCHUNKS,iphase)
   endif
@@ -2167,11 +2184,10 @@
           iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
           npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
           iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces,imsg_type, &
+          iprocfrom_faces,iprocto_faces, &
           iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
           buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
-          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
-          NUM_MSG_TYPES,iphase, &
+          buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar,iphase, &
           hprime_xx,hprime_yy,hprime_zz, &
           hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,ibool_outer_core,icall)
@@ -2182,17 +2198,17 @@
 ! outer core
 #ifdef USE_MPI
   if(USE_NONBLOCKING_COMMS) then
-    do while (iphase <= 5) ! make sure the last communications are finished and processed
+    do while (iphase <= 7) ! make sure the last communications are finished and processed
       call assemble_MPI_scalar(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
             npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
             iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
+            buffer_send_faces,buffer_received_faces,npoin2D_xi_outer_core(1), &
             buffer_send_chunkcorners_scalar,buffer_recv_chunkcorners_scalar, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
             NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE),NGLOB2DMAX_XY_VAL_OC,NCHUNKS,iphase)
     enddo
@@ -2256,11 +2272,14 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
             buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
-            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUM_MSG_TYPES,iphase, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+!!!!!!!!!!!!!!!!!!!!!!!!!
+          nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+          npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+          receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),INCLUDE_CENTRAL_CUBE,iphase_CC, &
 #endif
           COMPUTE_AND_STORE_STRAIN,AM_V,icall)
 
@@ -2426,7 +2445,17 @@
 ! in order to reduce the number of MPI messages by 2
 #ifdef USE_MPI
   if(USE_NONBLOCKING_COMMS) then
-    iphase = 1 ! start the non blocking communications
+
+!   if(INCLUDE_CENTRAL_CUBE) then
+        iphase_CC = 1 ! initialize the non blocking communication counter for the central cube
+!       call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+!         npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+!         receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+!         ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),accel_inner_core,NDIM,iphase_CC)
+!   endif
+
+    iphase = 1 ! initialize the non blocking communication counter
+! start the non blocking communications
     call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
@@ -2435,11 +2464,11 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
             buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
             NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS,iphase)
   endif
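
This hunk sets iphase_CC and iphase to 1 and then posts the first phase of the crust-mantle/inner-core vector exchange (the corresponding start call for the central cube is left commented out); the matching completion loops appear further down. Assuming the two hypothetical sketches above sit in the same file, the resulting start / compute / drain ordering can be illustrated by this toy program, to be run with exactly two MPI processes; none of the names here are the actual SPECFEM ones:

! toy driver showing the start / compute-with-overlap / drain ordering
! (run with exactly two MPI processes, e.g. mpirun -np 2)
program overlap_sketch
  use mpi
  use phase_exchange, only: assemble_phase
  implicit none
  integer, parameter :: ndata = 1000, nspec = 5000
  double precision :: buffer_send(ndata), buffer_recv(ndata), field(nspec)
  integer :: iphase, myrank, ineighbor, ier
  call MPI_INIT(ier)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ier)
  ineighbor = 1 - myrank          ! toy pairing of ranks 0 and 1
  buffer_send = dble(myrank + 1)
  buffer_recv = 0.d0
  field = 0.d0
  iphase = 1
  ! start the exchange, then do local work while the messages are in flight
  call assemble_phase(iphase, buffer_send, buffer_recv, ndata, ineighbor)
  call compute_elements_with_overlap(field, nspec, iphase, &
                                     buffer_send, buffer_recv, ndata, ineighbor)
  ! drain: make sure the last communications are finished and processed
  do while (iphase <= 2)
    call assemble_phase(iphase, buffer_send, buffer_recv, ndata, ineighbor)
  enddo
  print *, 'rank', myrank, 'received', buffer_recv(1)
  call MPI_FINALIZE(ier)
end program overlap_sketch
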
@@ -2477,11 +2506,14 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
             buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
-            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUM_MSG_TYPES,iphase, &
+            buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector,iphase, &
+!!!!!!!!!!!!!!!!!!!!!!!!!
+          nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+          npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+          receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),INCLUDE_CENTRAL_CUBE,iphase_CC, &
           COMPUTE_AND_STORE_STRAIN,AM_V,icall)
   endif
 #endif
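
Here, as at the earlier call site, the added communication arguments exist only in the MPI build: they sit inside #ifdef USE_MPI sections of the .F90 source, which works because the .F90 suffix (or an explicit preprocessing flag) runs the C preprocessor before the Fortran compiler sees the continuation lines. A self-contained, purely hypothetical example of the same construction:

! hypothetical .F90 example of an argument list that grows in the MPI build
! (save with a .F90 suffix so the preprocessor runs; all names are illustrative)
program demo_ifdef_args
  implicit none
  real :: x(4)
#ifdef USE_MPI
  integer :: iphase
  iphase = 1
#endif
  x = 0.
  call do_work(x, size(x) &
#ifdef USE_MPI
        , iphase &
#endif
        )
contains
  subroutine do_work(a, n &
#ifdef USE_MPI
        , iphase &
#endif
        )
    integer, intent(in) :: n
    real, intent(inout) :: a(n)
#ifdef USE_MPI
    integer, intent(inout) :: iphase
#endif
    a = a + 1.
#ifdef USE_MPI
    iphase = iphase + 1   ! e.g. advance a communication phase between blocks of work
#endif
  end subroutine do_work
end program demo_ifdef_args
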
@@ -2509,7 +2541,7 @@
 ! crust/mantle and inner core handled in the same call
 ! in order to reduce the number of MPI messages by 2
   if(USE_NONBLOCKING_COMMS) then
-    do while (iphase <= 5) ! make sure the last communications are finished and processed
+    do while (iphase <= 7) ! make sure the last communications are finished and processed
       call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
             iproc_xi,iproc_eta,ichunk,addressing, &
             iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
@@ -2518,11 +2550,11 @@
             iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
             npoin2D_faces_inner_core,npoin2D_xi_inner_core(1),npoin2D_eta_inner_core(1), &
             iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
             buffer_send_faces,buffer_received_faces,npoin2D_max_all, &
             buffer_send_chunkcorners_vector,buffer_recv_chunkcorners_vector, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NUMMSGS_FACES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
             NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS,iphase)
     enddo
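
This completion loop re-enters the assembly routine so that the data arriving in each phase can be processed as soon as it is available, rather than waiting once for everything at the end. If nothing had to be processed between phases, a single blocking wait on all outstanding requests would do, as in this hedged sketch (wait_for_all is a hypothetical helper, not part of the code above):

! hypothetical blocking alternative: wait once on every outstanding request
subroutine wait_for_all(requests, nreq)
  use mpi
  implicit none
  integer, intent(in) :: nreq
  integer, intent(inout) :: requests(nreq)
  integer :: ier
  call MPI_WAITALL(nreq, requests, MPI_STATUSES_IGNORE, ier)
end subroutine wait_for_all
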
@@ -2551,10 +2583,19 @@
 
 #ifdef USE_MPI
   if(INCLUDE_CENTRAL_CUBE) then
-   call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-     npoin2D_cube_from_slices, buffer_all_cube_from_slices, buffer_slices, buffer_slices2, ibool_central_cube, &
-     receiver_cube_from_slices, ibool_inner_core, idoubling_inner_core, NSPEC_INNER_CORE, &
-     ibelm_bottom_inner_core, NSPEC2D_BOTTOM(IREGION_INNER_CORE),NGLOB_INNER_CORE,accel_inner_core,NDIM)
+    if(USE_NONBLOCKING_COMMS) then
+      do while (iphase_CC <= 4) ! make sure the last communications are finished and processed
+        call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+          npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
+          receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+          ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),accel_inner_core,NDIM,iphase_CC)
+      enddo
+    else
+      call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+        npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,buffer_slices2,ibool_central_cube, &
+        receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core,NSPEC_INNER_CORE, &
+        ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),NGLOB_INNER_CORE,accel_inner_core,NDIM)
+    endif
   endif   ! end of assembling forces with the central cube
 #endif
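
The last hunk keeps two code paths for assembling with the central cube: with USE_NONBLOCKING_COMMS the do-while drives assemble_MPI_central_cube until iphase_CC moves past 4, otherwise the blocking assemble_MPI_central_cube_block variant is called once. A purely hypothetical sketch of such a run-time dispatch between a phased and a blocking exchange, reusing the assemble_phase sketch from earlier (assemble_buffer and its arguments are illustrative):

! hypothetical run-time dispatch between a phased and a blocking exchange
subroutine assemble_buffer(use_nonblocking, iphase_cc, buffer_send, buffer_recv, &
                           ndata, ipartner)
  use mpi
  use phase_exchange, only: assemble_phase
  implicit none
  logical, intent(in) :: use_nonblocking
  integer, intent(inout) :: iphase_cc
  integer, intent(in) :: ndata, ipartner
  double precision, intent(in) :: buffer_send(ndata)
  double precision, intent(inout) :: buffer_recv(ndata)
  integer :: ier
  if (use_nonblocking) then
    ! phased version: the caller loops "do while (iphase_cc <= 2)" around this call
    call assemble_phase(iphase_cc, buffer_send, buffer_recv, ndata, ipartner)
  else
    ! blocking version: one combined send/receive, no phase counter needed
    call MPI_SENDRECV(buffer_send, ndata, MPI_DOUBLE_PRECISION, ipartner, 0, &
                      buffer_recv, ndata, MPI_DOUBLE_PRECISION, ipartner, 0, &
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE, ier)
  endif
end subroutine assemble_buffer
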
 


