[cig-commits] r22991 - in seismo/3D/SPECFEM3D_GLOBE/trunk: . setup src/auxiliaries src/cuda src/meshfem3D src/shared src/specfem3D utils/Visualization/VTK_ParaView

danielpeter at geodynamics.org
Mon Jan 13 08:06:21 PST 2014


Author: danielpeter
Date: 2014-01-13 08:06:21 -0800 (Mon, 13 Jan 2014)
New Revision: 22991

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in
   seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90
Log:
adds asynchronous memory copies between CPU and GPU; fixes a bug in adjoint sources on GPU; additional tweaks to retrieving values for crust2.0 arrays; adds explicit output format for VTK cell types
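
For reference, the asynchronous copy scheme introduced by this commit uses two CUDA streams: kernels run on a compute stream while MPI buffers move between pinned host memory and the device on a separate copy stream. Below is a minimal, self-contained sketch of that pattern with hypothetical names (the actual fields live in the Mesh struct as mp->compute_stream and mp->copy_stream):

  #include <cuda_runtime.h>

  void example_async_d2h(const float *d_buf, size_t n) {
    cudaStream_t compute_stream, copy_stream;
    cudaStreamCreate(&compute_stream);
    cudaStreamCreate(&copy_stream);

    // the host buffer must be pinned for cudaMemcpyAsync to overlap with compute
    float *h_buf;
    cudaMallocHost((void**)&h_buf, n * sizeof(float));

    // ... kernel launched on compute_stream fills d_buf ...

    // wait for the kernel, then issue the copy on the independent copy_stream,
    // so that later kernels on compute_stream overlap with the transfer
    cudaStreamSynchronize(compute_stream);
    cudaMemcpyAsync(h_buf, d_buf, n * sizeof(float),
                    cudaMemcpyDeviceToHost, copy_stream);

    // before handing h_buf to MPI, make sure the transfer has completed
    cudaStreamSynchronize(copy_stream);

    cudaFreeHost(h_buf);
    cudaStreamDestroy(compute_stream);
    cudaStreamDestroy(copy_stream);
  }

The same two-stream idea applies in the opposite direction for the receive buffers, as the new transfer_buffer_to_device_async routine further down shows.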

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in	2014-01-13 16:06:21 UTC (rev 22991)
@@ -55,7 +55,7 @@
 #######################################
 ####
 #### CUDA
-#### with configure: ./configure --with-cuda CUDA_LIB=.. CUDA_INC=.. MPI_INC=.. ..
+#### with configure: ./configure --with-cuda=cuda5 CUDA_FLAGS=.. CUDA_LIB=.. CUDA_INC=.. MPI_INC=.. ..
 ####
 #######################################
 
@@ -66,7 +66,8 @@
 @COND_CUDA5_FALSE@CUDA5 = no
 
 MPI_INCLUDES = @MPI_INCLUDES@
-CUDA_INC = @CUDA_FLAGS@ @CUDA_CPPFLAGS@ -I${SETUP}
+CUDA_FLAGS = @CUDA_FLAGS@
+CUDA_INC = @CUDA_CPPFLAGS@ -I${SETUP}
 CUDA_LINK = @CUDA_LDFLAGS@ @CUDA_LIBS@
 
 @COND_CUDA_TRUE@NVCC = nvcc
@@ -89,7 +90,7 @@
 @COND_CUDA_TRUE@@COND_CUDA5_FALSE@GENCODE = $(GENCODE_20)
 
 # CUDA flags and linking
-@COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INCLUDES)
+@COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_FLAGS) $(CUDA_INC) $(MPI_INCLUDES)
 @COND_CUDA_TRUE@@COND_CUDA5_TRUE@NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc $(GENCODE)
 @COND_CUDA_TRUE@@COND_CUDA5_FALSE@NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in	2014-01-13 16:06:21 UTC (rev 22991)
@@ -236,19 +236,25 @@
 ! the mesher) and use them for the computation of boundary kernel (in the solver)
   logical, parameter :: SAVE_BOUNDARY_MESH = .false.
 
-!************************************************************
-! added these parameters for the GPU version of the solver with mesh coloring
+!!-----------------------------------------------------------
+!!
+!! GPU optimization
+!!
+!!-----------------------------------------------------------
+! added these parameters for the GPU version of the solver
 
+! asynchronous memory copy between CPU and GPU
+  logical, parameter :: GPU_ASYNC_COPY = .true.
+
+! mesh coloring
 ! add mesh coloring for the GPU + MPI implementation
   logical, parameter :: USE_MESH_COLORING_GPU = .false.
   integer, parameter :: MAX_NUMBER_OF_COLORS = 1000
-
 ! enhanced coloring:
 ! using Droux algorithm
 ! try several times with one more color before giving up
   logical, parameter :: USE_DROUX_OPTIMIZATION = .false.
   integer, parameter :: MAX_NB_TRIES_OF_DROUX_1993 = 15
-
 ! using balancing algorithm
 ! postprocess the colors to balance them if Droux (1993) algorithm is not used
   logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .true.
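
The new GPU_ASYNC_COPY parameter has a C-side counterpart (the error messages added further down refer to mesh_constants_cuda.h). A short sketch of how such a compile-time switch can select between the blocking and the stream-based copy path; the macro and helper names here are assumptions for illustration:

  #include <cuda_runtime.h>

  #define GPU_ASYNC_COPY 1  // assumed setting, mirroring the Fortran parameter

  static void copy_buffer_to_host(float *h_pinned, const float *d_buf, size_t bytes,
                                  cudaStream_t compute_stream, cudaStream_t copy_stream) {
    if (GPU_ASYNC_COPY) {
      // the kernel writing d_buf must finish before the async copy starts
      cudaStreamSynchronize(compute_stream);
      cudaMemcpyAsync(h_pinned, d_buf, bytes, cudaMemcpyDeviceToHost, copy_stream);
    } else {
      // blocking copy; implicitly synchronizes with outstanding device work
      cudaMemcpy(h_pinned, d_buf, bytes, cudaMemcpyDeviceToHost);
    }
  }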

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -114,7 +114,7 @@
   print  *, sizeprocs, "procs"
   if (sizeprocs .ne. 1) then
     print *, "sequential program. Only mpirun -np 1 ..."
-    call MPI_Abort(MPI_COMM_WORLD, mpier, ierr)  
+    call MPI_Abort(MPI_COMM_WORLD, mpier, ierr)
   endif
 #endif
 
@@ -357,7 +357,7 @@
       print *
 
       ! topology file
-#ifndef ADIOS_INPUT      
+#ifndef ADIOS_INPUT
       topo_file = trim(prname_topo) // 'solver_data.bin'
       open(unit = 28,file = trim(topo_file),status='old',action='read', iostat = ios, form='unformatted')
       if (ios /= 0) then
@@ -461,7 +461,7 @@
                   total_dat_xyz(1,np+numpoin) = x
                   total_dat_xyz(2,np+numpoin) = y
                   total_dat_xyz(3,np+numpoin) = z
-#endif                  
+#endif
                   mask_ibool(iglob) = .true.
                   num_ibool(iglob) = numpoin
                 endif
@@ -521,7 +521,7 @@
           if( idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) then
             ! connectivity must be given, otherwise element count would be wrong
             ! maps "fictitious" connectivity, element is all with iglob = 1
-#ifndef USE_VTK_INSTEAD_OF_MESH            
+#ifndef USE_VTK_INSTEAD_OF_MESH
             do k = 1, NGLLZ-1, dk
               do j = 1, NGLLY-1, dj
                 do i = 1, NGLLX-1, di
@@ -635,7 +635,7 @@
     ! VTK
     ! type: hexahedrons
     write(IOVTK,'(a,i12)') "CELL_TYPES ",ne
-    write(IOVTK,*) (12,it=1,ne)
+    write(IOVTK,'(6i12)') (12,it=1,ne)
     write(IOVTK,*) ""
 
     write(IOVTK,'(a,i12)') "POINT_DATA ",np
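
The CELL_TYPES change above replaces list-directed output (write(IOVTK,*)), whose line breaks and field widths are compiler-dependent, with the explicit edit descriptor '(6i12)': six fixed-width integers per line, each the VTK cell type 12 (VTK_HEXAHEDRON). A stand-alone C sketch of the equivalent record, purely to show the intended layout (not part of the commit):

  #include <stdio.h>

  void write_cell_types(FILE *fp, int ne) {
    fprintf(fp, "CELL_TYPES %12d\n", ne);
    for (int i = 0; i < ne; i++) {
      fprintf(fp, "%12d", 12);               /* 12 == VTK_HEXAHEDRON */
      if ((i + 1) % 6 == 0 || i == ne - 1)   /* wrap after six values */
        fprintf(fp, "\n");
    }
  }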

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -6,7 +6,7 @@
 contains
 
 !=============================================================================
-!> Print help message. 
+!> Print help message.
 subroutine print_usage_adios()
   print *, 'Usage: '
   print *, '   xcombine_data slice_list varname var_file mesh_file ' // &
@@ -73,7 +73,7 @@
   endif
 
   iregion = 0
-  if (command_argument_count() == 7) then 
+  if (command_argument_count() == 7) then
     read(arg(7),*) iregion
   endif
   if (iregion > 3 .or. iregion < 0) stop 'Iregion = 0,1,2,3'
@@ -89,11 +89,11 @@
 
 
 !=============================================================================
-!> Open ADIOS value and mesh files, read mode 
+!> Open ADIOS value and mesh files, read mode
 subroutine init_adios(value_file_name, mesh_file_name, &
                       value_handle, mesh_handle)
   implicit none
-  ! Parameters 
+  ! Parameters
   character(len=*), intent(in) :: value_file_name, mesh_file_name
   integer(kind=8), intent(out) :: value_handle, mesh_handle
   ! Variables
@@ -109,10 +109,10 @@
 
 
 !=============================================================================
-!> Open ADIOS value and mesh files, read mode 
+!> Open ADIOS value and mesh files, read mode
 subroutine clean_adios(value_handle, mesh_handle)
   implicit none
-  ! Parameters 
+  ! Parameters
   integer(kind=8), intent(in) :: value_handle, mesh_handle
   ! Variables
   integer :: ier
@@ -142,18 +142,18 @@
                            0, 1, nglob, ier)
   call adios_schedule_read(mesh_handle, sel, trim(reg_name) // "/nspec", &
                            0, 1, nspec, ier)
-  call adios_perform_reads(mesh_handle, ier) 
+  call adios_perform_reads(mesh_handle, ier)
 end subroutine read_scalars_adios_mesh
 
 
 !=============================================================================
-subroutine read_coordinates_adios_mesh(mesh_handle, iproc, ir, nglob, nspec, & 
+subroutine read_coordinates_adios_mesh(mesh_handle, iproc, ir, nglob, nspec, &
                                        xstore, ystore, zstore, ibool)
   implicit none
   include 'constants.h'
   ! Parameters
   integer(kind=8), intent(in) :: mesh_handle
-  integer, intent(in) :: iproc, ir, nglob, nspec 
+  integer, intent(in) :: iproc, ir, nglob, nspec
   real(kind=CUSTOM_REAL),dimension(:), intent(inout) :: xstore, ystore, zstore
   integer, dimension(:,:,:,:), intent(inout) :: ibool
   ! Variables
@@ -171,7 +171,7 @@
   call adios_schedule_read(mesh_handle, sel_scalar,             &
                            trim(reg_name) // "x_global/offset", &
                            0, 1, offset_coord, ier)
-  call adios_perform_reads(mesh_handle, ier) 
+  call adios_perform_reads(mesh_handle, ier)
 
   start(1) = offset_ibool
   count_ad(1) = NGLLX * NGLLY * NGLLZ * nspec
@@ -180,7 +180,7 @@
                            trim(reg_name) // "ibool/array", 0, 1, &
                            ibool, ier)
 
-  start(1) = offset_coord 
+  start(1) = offset_coord
   count_ad(1) = nglob
   call adios_selection_boundingbox (sel_coord , 1, start, count_ad)
   call adios_schedule_read(mesh_handle, sel_coord, &
@@ -192,7 +192,7 @@
   call adios_schedule_read(mesh_handle, sel_coord, &
                            trim(reg_name) // "z_global/array", 0, 1, &
                            zstore, ier)
-  call adios_perform_reads(mesh_handle, ier) 
+  call adios_perform_reads(mesh_handle, ier)
 end subroutine read_coordinates_adios_mesh
 
 
@@ -202,7 +202,7 @@
   implicit none
   include 'constants.h'
   ! Parameters
-  integer(kind=8), intent(in) :: value_handle 
+  integer(kind=8), intent(in) :: value_handle
   character(len=*), intent(in) :: var_name
   integer, intent(in) :: iproc, ir, nspec
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), intent(inout) :: data
@@ -218,7 +218,7 @@
   call adios_schedule_read(value_handle, sel,                             &
                            trim(reg_name) // trim(var_name) // "/offset", &
                            0, 1, offset, ier)
-  call adios_perform_reads(value_handle, ier) 
+  call adios_perform_reads(value_handle, ier)
 
   start(1) = offset
   count_ad(1) = NGLLX * NGLLY * NGLLZ * nspec
@@ -226,7 +226,7 @@
   call adios_schedule_read(value_handle, sel, &
                            trim(reg_name) // trim(var_name) // "/array", 0, 1,&
                            data, ier)
-  call adios_perform_reads(value_handle, ier) 
+  call adios_perform_reads(value_handle, ier)
 end subroutine read_values_adios
 
 end module combine_vol_data_adios_mod

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -63,22 +63,31 @@
 
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// MPI transfer
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // prepares and transfers the inter-element edge-nodes to the host to be MPI'd
+// (elements on boundary)
+
 extern "C"
 void FC_FUNC_(transfer_boun_pot_from_device,
               TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
-                                             realw* send_potential_dot_dot_buffer,
+                                             realw* send_buffer,
                                              int* FORWARD_OR_ADJOINT){
 
   TRACE("transfer_boun_pot_from_device");
+  int size_mpi_buffer;
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
+  // mpi buffer size
+  size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
+
   // checks if anything to do
-  if( mp->num_interfaces_outer_core == 0 ) return;
+  if( size_mpi_buffer <= 0 ) return;
 
   int blocksize = BLOCKSIZE_TRANSFER;
   int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_oc))/((double)blocksize)))*blocksize;
@@ -90,41 +99,65 @@
   dim3 threads(blocksize,1,1);
 
   if(*FORWARD_OR_ADJOINT == 1) {
-    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
-                                                           mp->d_send_accel_buffer_outer_core,
-                                                           mp->num_interfaces_outer_core,
-                                                           mp->max_nibool_interfaces_oc,
-                                                           mp->d_nibool_interfaces_outer_core,
-                                                           mp->d_ibool_interfaces_outer_core);
+    prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_outer_core,
+                                                                                mp->d_send_accel_buffer_outer_core,
+                                                                                mp->num_interfaces_outer_core,
+                                                                                mp->max_nibool_interfaces_oc,
+                                                                                mp->d_nibool_interfaces_outer_core,
+                                                                                mp->d_ibool_interfaces_outer_core);
 
-    print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_accel_buffer_outer_core,
-                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+    // copies buffer to CPU
+    if( GPU_ASYNC_COPY ){
+      // waits until kernel is finished before starting async memcpy
+      cudaStreamSynchronize(mp->compute_stream);
+      // copies buffer to CPU
+      cudaMemcpyAsync(mp->h_send_accel_buffer_oc,mp->d_send_accel_buffer_outer_core,size_mpi_buffer*sizeof(realw),
+                      cudaMemcpyDeviceToHost,mp->copy_stream);
+    }else{
+      // synchronous copy
+      print_CUDA_error_if_any(cudaMemcpy(send_buffer,mp->d_send_accel_buffer_outer_core,
+                                       size_mpi_buffer*sizeof(realw),
                                        cudaMemcpyDeviceToHost),98000);
+    }
 
   }
   else if(*FORWARD_OR_ADJOINT == 3) {
     // debug
     DEBUG_BACKWARD_ASSEMBLY();
 
-    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
-                                                           mp->d_b_send_accel_buffer_outer_core,
-                                                           mp->num_interfaces_outer_core,
-                                                           mp->max_nibool_interfaces_oc,
-                                                           mp->d_nibool_interfaces_outer_core,
-                                                           mp->d_ibool_interfaces_outer_core);
+    prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
+                                                                                mp->d_b_send_accel_buffer_outer_core,
+                                                                                mp->num_interfaces_outer_core,
+                                                                                mp->max_nibool_interfaces_oc,
+                                                                                mp->d_nibool_interfaces_outer_core,
+                                                                                mp->d_ibool_interfaces_outer_core);
 
-    print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_b_send_accel_buffer_outer_core,
-                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+
+    // copies buffer to CPU
+    if( GPU_ASYNC_COPY ){
+      // waits until kernel is finished before starting async memcpy
+      cudaStreamSynchronize(mp->compute_stream);
+      // copies buffer to CPU
+      cudaMemcpyAsync(mp->h_b_send_accel_buffer_oc,mp->d_b_send_accel_buffer_outer_core,size_mpi_buffer*sizeof(realw),
+                      cudaMemcpyDeviceToHost,mp->copy_stream);
+    }else{
+      // synchronous copy
+      print_CUDA_error_if_any(cudaMemcpy(send_buffer,mp->d_b_send_accel_buffer_outer_core,
+                                       size_mpi_buffer*sizeof(realw),
                                        cudaMemcpyDeviceToHost),98001);
+    }
 
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("prepare_boundary_kernel");
+  exit_on_cuda_error("transfer_boun_pot_from_device");
 #endif
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// ASSEMBLY
+
 /* ----------------------------------------------------------------------------------------------- */
 
 
@@ -161,12 +194,15 @@
                                             int* FORWARD_OR_ADJOINT) {
 
   TRACE("transfer_asmbl_pot_to_device");
+  int size_mpi_buffer;
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-  //double start_time = get_time();
 
+  // buffer size
+  size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
+
   // checks if anything to do
-  if( mp->num_interfaces_outer_core == 0 ) return;
+  if( size_mpi_buffer <= 0 ) return;
 
   // assembles on GPU
   int blocksize = BLOCKSIZE_TRANSFER;
@@ -179,35 +215,46 @@
   dim3 threads(blocksize,1,1);
 
   if(*FORWARD_OR_ADJOINT == 1) {
-    // copies scalar buffer onto GPU
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
-                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),99000);
 
-    //assemble forward field
-    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
-                                                            mp->d_send_accel_buffer_outer_core,
-                                                            mp->num_interfaces_outer_core,
-                                                            mp->max_nibool_interfaces_oc,
-                                                            mp->d_nibool_interfaces_outer_core,
-                                                            mp->d_ibool_interfaces_outer_core);
-  }
-  else if(*FORWARD_OR_ADJOINT == 3) {
+    // asynchronous copy
+    if( GPU_ASYNC_COPY ){
+      // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+      cudaStreamSynchronize(mp->copy_stream);
+    }else{
+      // copies scalar buffer onto GPU
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,size_mpi_buffer*sizeof(realw),
+                                         cudaMemcpyHostToDevice),99000);
+    }
+
+    //assembles forward field
+    assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_outer_core,
+                                                                                 mp->d_send_accel_buffer_outer_core,
+                                                                                 mp->num_interfaces_outer_core,
+                                                                                 mp->max_nibool_interfaces_oc,
+                                                                                 mp->d_nibool_interfaces_outer_core,
+                                                                                 mp->d_ibool_interfaces_outer_core);
+  }else if(*FORWARD_OR_ADJOINT == 3) {
     // debug
     DEBUG_BACKWARD_ASSEMBLY();
 
-    // copies scalar buffer onto GPU
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_outer_core, buffer_recv_scalar,
-                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),99001);
+    // asynchronous copy
+    if( GPU_ASYNC_COPY ){
+      // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+      cudaStreamSynchronize(mp->copy_stream);
+    }else{
+      // (cudaMemcpy implicitly synchronizes all other cuda operations)
+      // copies scalar buffer onto GPU
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_outer_core, buffer_recv_scalar,size_mpi_buffer*sizeof(realw),
+                                         cudaMemcpyHostToDevice),99001);
+    }
 
-    //assemble reconstructed/backward field
-    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
-                                                            mp->d_b_send_accel_buffer_outer_core,
-                                                            mp->num_interfaces_outer_core,
-                                                            mp->max_nibool_interfaces_oc,
-                                                            mp->d_nibool_interfaces_outer_core,
-                                                            mp->d_ibool_interfaces_outer_core);
+    //assembles reconstructed/backward field
+    assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
+                                                                                 mp->d_b_send_accel_buffer_outer_core,
+                                                                                 mp->num_interfaces_outer_core,
+                                                                                 mp->max_nibool_interfaces_oc,
+                                                                                 mp->d_nibool_interfaces_outer_core,
+                                                                                 mp->d_ibool_interfaces_outer_core);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
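
On the assembly side above, the receive buffer is staged onto the device beforehand (by the asynchronous host-to-device transfer added in this commit), so the routine only waits on the copy stream before launching the assembly kernel on the compute stream. A simplified, self-contained sketch of that ordering; the generic kernel and names are illustrative, not the actual SPECFEM routines:

  #include <cuda_runtime.h>

  __global__ void assemble_kernel(float *d_accel, const float *d_recv_buf, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) atomicAdd(&d_accel[id], d_recv_buf[id]);  // sum MPI contributions
  }

  void example_assemble(float *d_accel, const float *d_recv_buf, int n,
                        cudaStream_t compute_stream, cudaStream_t copy_stream) {
    // the earlier cudaMemcpyAsync on copy_stream must have delivered d_recv_buf
    cudaStreamSynchronize(copy_stream);
    int blocksize = 256;
    int nblocks = (n + blocksize - 1) / blocksize;
    assemble_kernel<<<nblocks, blocksize, 0, compute_stream>>>(d_accel, d_recv_buf, n);
  }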

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -66,26 +66,35 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// MPI transfer
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // prepares and transfers the inter-element edge-nodes to the host to be MPI'd
 // (elements on boundary)
+
 extern "C"
-void FC_FUNC_(transfer_boun_accel_from_device,
-              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
-                                               realw* send_accel_buffer,
-                                               int* IREGION,
-                                               int* FORWARD_OR_ADJOINT){
-  TRACE("transfer_boun_accel_from_device");
+void FC_FUNC_(transfer_boun_from_device,
+              TRANSFER_BOUN_FROM_DEVICE)(long* Mesh_pointer_f,
+                                         realw* send_accel_buffer,
+                                         int* IREGION,
+                                         int* FORWARD_OR_ADJOINT){
+  TRACE("transfer_boun_from_device");
 
   int blocksize,size_padded;
   int num_blocks_x,num_blocks_y;
   dim3 grid,threads;
+  int size_mpi_buffer;
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   // crust/mantle region
   if( *IREGION == IREGION_CRUST_MANTLE ){
-    if( mp->num_interfaces_crust_mantle > 0 ){
 
+    size_mpi_buffer = NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle;
+
+    if( size_mpi_buffer > 0 ){
+
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
 
@@ -95,44 +104,60 @@
       threads = dim3(blocksize,1,1);
 
       if(*FORWARD_OR_ADJOINT == 1) {
-        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
-                                                           mp->d_send_accel_buffer_crust_mantle,
-                                                           mp->num_interfaces_crust_mantle,
-                                                           mp->max_nibool_interfaces_cm,
-                                                           mp->d_nibool_interfaces_crust_mantle,
-                                                           mp->d_ibool_interfaces_crust_mantle);
+        prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
+                                                                                mp->d_send_accel_buffer_crust_mantle,
+                                                                                mp->num_interfaces_crust_mantle,
+                                                                                mp->max_nibool_interfaces_cm,
+                                                                                mp->d_nibool_interfaces_crust_mantle,
+                                                                                mp->d_ibool_interfaces_crust_mantle);
 
         // copies buffer to CPU
-        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
-                                           NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
-                                           cudaMemcpyDeviceToHost),41000);
-
+        if( GPU_ASYNC_COPY ){
+          // waits until kernel is finished before starting async memcpy
+          cudaStreamSynchronize(mp->compute_stream);
+          // copies buffer to CPU
+          cudaMemcpyAsync(mp->h_send_accel_buffer_cm,mp->d_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+                          cudaMemcpyDeviceToHost,mp->copy_stream);
+        }else{
+          // synchronous copy
+          print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+                                             cudaMemcpyDeviceToHost),41000);
+        }
       }
       else if(*FORWARD_OR_ADJOINT == 3) {
         // debug
         DEBUG_BACKWARD_ASSEMBLY();
 
-        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
-                                                           mp->d_b_send_accel_buffer_crust_mantle,
-                                                           mp->num_interfaces_crust_mantle,
-                                                           mp->max_nibool_interfaces_cm,
-                                                           mp->d_nibool_interfaces_crust_mantle,
-                                                           mp->d_ibool_interfaces_crust_mantle);
+        prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
+                                                                                mp->d_b_send_accel_buffer_crust_mantle,
+                                                                                mp->num_interfaces_crust_mantle,
+                                                                                mp->max_nibool_interfaces_cm,
+                                                                                mp->d_nibool_interfaces_crust_mantle,
+                                                                                mp->d_ibool_interfaces_crust_mantle);
+
         // copies buffer to CPU
-        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_crust_mantle,
-                                           NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
-                                           cudaMemcpyDeviceToHost),41001);
-
+        if( GPU_ASYNC_COPY ){
+          // waits until kernel is finished before starting async memcpy
+          cudaStreamSynchronize(mp->compute_stream);
+          // copies buffer to CPU
+          cudaMemcpyAsync(mp->h_b_send_accel_buffer_cm,mp->d_b_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+                          cudaMemcpyDeviceToHost,mp->copy_stream);
+        }else{
+          // synchronous copy
+          print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+                                             cudaMemcpyDeviceToHost),41001);
+        }
       }
-
-
     }
   }
 
   // inner core region
   if( *IREGION == IREGION_INNER_CORE ){
-    if( mp->num_interfaces_inner_core > 0 ){
 
+    size_mpi_buffer = NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core;
+
+    if( size_mpi_buffer > 0 ){
+
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
 
@@ -142,45 +167,65 @@
       threads = dim3(blocksize,1,1);
 
       if(*FORWARD_OR_ADJOINT == 1) {
-        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
-                                                           mp->d_send_accel_buffer_inner_core,
-                                                           mp->num_interfaces_inner_core,
-                                                           mp->max_nibool_interfaces_ic,
-                                                           mp->d_nibool_interfaces_inner_core,
-                                                           mp->d_ibool_interfaces_inner_core);
+        prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_inner_core,
+                                                                                mp->d_send_accel_buffer_inner_core,
+                                                                                mp->num_interfaces_inner_core,
+                                                                                mp->max_nibool_interfaces_ic,
+                                                                                mp->d_nibool_interfaces_inner_core,
+                                                                                mp->d_ibool_interfaces_inner_core);
 
         // copies buffer to CPU
-        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
-                                           NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+        if( GPU_ASYNC_COPY ){
+          // waits until kernel is finished before starting async memcpy
+          cudaStreamSynchronize(mp->compute_stream);
+          // copies buffer to CPU
+          cudaMemcpyAsync(mp->h_send_accel_buffer_ic,mp->d_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
+                          cudaMemcpyDeviceToHost,mp->copy_stream);
+        }else{
+          // synchronous copy
+          print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
                                            cudaMemcpyDeviceToHost),41000);
-
+        }
       }
       else if(*FORWARD_OR_ADJOINT == 3) {
         // debug
         DEBUG_BACKWARD_ASSEMBLY();
 
-        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
-                                                           mp->d_b_send_accel_buffer_inner_core,
-                                                           mp->num_interfaces_inner_core,
-                                                           mp->max_nibool_interfaces_ic,
-                                                           mp->d_nibool_interfaces_inner_core,
-                                                           mp->d_ibool_interfaces_inner_core);
+        prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_inner_core,
+                                                                                mp->d_b_send_accel_buffer_inner_core,
+                                                                                mp->num_interfaces_inner_core,
+                                                                                mp->max_nibool_interfaces_ic,
+                                                                                mp->d_nibool_interfaces_inner_core,
+                                                                                mp->d_ibool_interfaces_inner_core);
         // copies buffer to CPU
-        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_inner_core,
-                                           NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+        if( GPU_ASYNC_COPY ){
+          // waits until kernel is finished before starting async memcpy
+          cudaStreamSynchronize(mp->compute_stream);
+          // copies buffer to CPU
+          cudaMemcpyAsync(mp->h_b_send_accel_buffer_ic,mp->d_b_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
+                        cudaMemcpyDeviceToHost,mp->copy_stream);
+        }else{
+          // synchronous copy
+          print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
                                            cudaMemcpyDeviceToHost),41001);
+        }
       }
 
     }
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("transfer_boun_accel_from_device");
+  exit_on_cuda_error("transfer_boun_from_device");
 #endif
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
+// ASSEMBLY
+
+/* ----------------------------------------------------------------------------------------------- */
+
 __global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
                                                   int num_interfaces,
                                                   int max_nibool_interfaces,
@@ -218,12 +263,16 @@
   int blocksize,size_padded;
   int num_blocks_x,num_blocks_y;
   dim3 grid,threads;
+  int size_mpi_buffer;
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // crust/mantle region
   if( *IREGION == IREGION_CRUST_MANTLE ){
-    if( mp->num_interfaces_crust_mantle > 0 ){
+
+    size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle);
+
+    if( size_mpi_buffer > 0 ){
       // assembles values
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
@@ -234,30 +283,42 @@
       threads = dim3(blocksize,1,1);
 
       if(*FORWARD_OR_ADJOINT == 1) {
-        // copies vector buffer values to GPU
-        print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle, buffer_recv_vector,
-                                           NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
-                                           cudaMemcpyHostToDevice),41000);
 
+        // asynchronous copy
+        if( GPU_ASYNC_COPY ){
+          // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+          cudaStreamSynchronize(mp->copy_stream);
+        }else{
+          // (cudaMemcpy implicitly synchronizes all other cuda operations)
+          // copies vector buffer values to GPU
+          print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+                                             cudaMemcpyHostToDevice),41000);
+        }
+
         //assemble forward accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
+        assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
                                                             mp->d_send_accel_buffer_crust_mantle,
                                                             mp->num_interfaces_crust_mantle,
                                                             mp->max_nibool_interfaces_cm,
                                                             mp->d_nibool_interfaces_crust_mantle,
                                                             mp->d_ibool_interfaces_crust_mantle);
-      }
-      else if(*FORWARD_OR_ADJOINT == 3) {
+      }else if(*FORWARD_OR_ADJOINT == 3) {
         // debug
         DEBUG_BACKWARD_ASSEMBLY();
 
-        // copies vector buffer values to GPU
-        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_crust_mantle, buffer_recv_vector,
-                                           NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
-                                           cudaMemcpyHostToDevice),41000);
+        // asynchronous copy
+        if( GPU_ASYNC_COPY ){
+          // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+          cudaStreamSynchronize(mp->copy_stream);
+        }else{
+          // (cudaMemcpy implicitly synchronizes all other cuda operations)
+          // copies vector buffer values to GPU
+          print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_crust_mantle,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+                                             cudaMemcpyHostToDevice),41000);
+        }
 
         //assemble adjoint accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+        assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
                                                             mp->d_b_send_accel_buffer_crust_mantle,
                                                             mp->num_interfaces_crust_mantle,
                                                             mp->max_nibool_interfaces_cm,
@@ -273,7 +334,10 @@
 
   // inner core region
   if( *IREGION == IREGION_INNER_CORE ){
-    if( mp->num_interfaces_inner_core > 0 ){
+
+    size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core);
+
+    if( size_mpi_buffer > 0 ){
       // assembles values
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
@@ -284,13 +348,19 @@
       threads = dim3(blocksize,1,1);
 
       if(*FORWARD_OR_ADJOINT == 1) {
-        // copies buffer values to GPU
-        print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
-                                           NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
-                                           cudaMemcpyHostToDevice),41001);
 
+        if( GPU_ASYNC_COPY ){
+          // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+          cudaStreamSynchronize(mp->copy_stream);
+        }else{
+          // (cudaMemcpy implicitly synchronizes all other cuda operations)
+          // copies buffer values to GPU
+          print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+                                             cudaMemcpyHostToDevice),41001);
+        }
+
         //assemble forward accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
+        assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_inner_core,
                                                             mp->d_send_accel_buffer_inner_core,
                                                             mp->num_interfaces_inner_core,
                                                             mp->max_nibool_interfaces_ic,
@@ -301,13 +371,18 @@
         // debug
         DEBUG_BACKWARD_ASSEMBLY();
 
-        // copies buffer values to GPU
-        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_inner_core, buffer_recv_vector,
-                                           NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
-                                           cudaMemcpyHostToDevice),41001);
+        if( GPU_ASYNC_COPY ){
+          // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+          cudaStreamSynchronize(mp->copy_stream);
+        }else{
+          // (cudaMemcpy implicitly synchronizes all other cuda operations)
+          // copies buffer values to GPU
+          print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_inner_core,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+                                             cudaMemcpyHostToDevice),41001);
+        }
 
         //assemble adjoint accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
+        assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_inner_core,
                                                             mp->d_b_send_accel_buffer_inner_core,
                                                             mp->num_interfaces_inner_core,
                                                             mp->max_nibool_interfaces_ic,
@@ -321,3 +396,201 @@
   exit_on_cuda_error("transfer_asmbl_accel_to_device in inner_core");
 #endif
 }
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Asynchronous memory copy for MPI buffers
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_buffer_to_device_async,
+              TRANSFER_BUFFER_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+                                             realw* buffer,
+                                             int* IREGION,
+                                             int* FORWARD_OR_ADJOINT) {
+
+// asynchronous transfer from host to device
+
+  TRACE("transfer_buffer_to_device_async");
+
+  int size_mpi_buffer;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  // checks async-memcpy
+  if( GPU_ASYNC_COPY == 0 ){
+    exit_on_error("transfer_buffer_to_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+  }
+
+  // regions
+  if( *IREGION == IREGION_CRUST_MANTLE ){
+    // crust/mantle region
+    size_mpi_buffer = NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle;
+
+    if( size_mpi_buffer > 0 ){
+
+      if(*FORWARD_OR_ADJOINT == 1) {
+        // copy on host memory
+        memcpy(mp->h_recv_accel_buffer_cm,buffer,size_mpi_buffer*sizeof(realw));
+
+        // asynchronous copy to GPU using copy_stream
+        cudaMemcpyAsync(mp->d_send_accel_buffer_crust_mantle,mp->h_recv_accel_buffer_cm,size_mpi_buffer*sizeof(realw),
+                        cudaMemcpyHostToDevice,mp->copy_stream);
+
+      }else if(*FORWARD_OR_ADJOINT == 3) {
+        // debug
+        DEBUG_BACKWARD_ASSEMBLY();
+
+        // copy on host memory
+        memcpy(mp->h_b_recv_accel_buffer_cm,buffer,size_mpi_buffer*sizeof(realw));
+
+        // asynchronous copy to GPU using copy_stream
+        cudaMemcpyAsync(mp->d_b_send_accel_buffer_crust_mantle,mp->h_b_recv_accel_buffer_cm,size_mpi_buffer*sizeof(realw),
+                        cudaMemcpyHostToDevice,mp->copy_stream);
+      }
+    }
+  }else if( *IREGION == IREGION_INNER_CORE ){
+    // inner core region
+    size_mpi_buffer = NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core;
+
+    if( size_mpi_buffer > 0 ){
+
+      if(*FORWARD_OR_ADJOINT == 1) {
+        // copy on host memory
+        memcpy(mp->h_recv_accel_buffer_ic,buffer,size_mpi_buffer*sizeof(realw));
+
+        // asynchronous copy to GPU using copy_stream
+        cudaMemcpyAsync(mp->d_send_accel_buffer_inner_core,mp->h_recv_accel_buffer_ic,size_mpi_buffer*sizeof(realw),
+                        cudaMemcpyHostToDevice,mp->copy_stream);
+
+      }else if(*FORWARD_OR_ADJOINT == 3) {
+        // debug
+        DEBUG_BACKWARD_ASSEMBLY();
+
+        // copy on host memory
+        memcpy(mp->h_b_recv_accel_buffer_ic,buffer,size_mpi_buffer*sizeof(realw));
+
+        // asynchronous copy to GPU using copy_stream
+        cudaMemcpyAsync(mp->d_b_send_accel_buffer_inner_core,mp->h_b_recv_accel_buffer_ic,size_mpi_buffer*sizeof(realw),
+                        cudaMemcpyHostToDevice,mp->copy_stream);
+      }
+    }
+  }else if( *IREGION == IREGION_OUTER_CORE ){
+    // outer core region
+    size_mpi_buffer = mp->max_nibool_interfaces_oc*mp->num_interfaces_outer_core;
+
+    if( size_mpi_buffer > 0 ){
+
+      if(*FORWARD_OR_ADJOINT == 1) {
+        // copy on host memory
+        memcpy(mp->h_recv_accel_buffer_oc,buffer,size_mpi_buffer*sizeof(realw));
+
+        // asynchronous copy to GPU using copy_stream
+        cudaMemcpyAsync(mp->d_send_accel_buffer_outer_core,mp->h_recv_accel_buffer_oc,size_mpi_buffer*sizeof(realw),
+                        cudaMemcpyHostToDevice,mp->copy_stream);
+
+      }else if(*FORWARD_OR_ADJOINT == 3) {
+        // debug
+        DEBUG_BACKWARD_ASSEMBLY();
+
+        // copy on host memory
+        memcpy(mp->h_b_recv_accel_buffer_oc,buffer,size_mpi_buffer*sizeof(realw));
+
+        // asynchronous copy to GPU using copy_stream
+        cudaMemcpyAsync(mp->d_b_send_accel_buffer_outer_core,mp->h_b_recv_accel_buffer_oc,size_mpi_buffer*sizeof(realw),
+                        cudaMemcpyHostToDevice,mp->copy_stream);
+      }
+    }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(sync_copy_from_device,
+              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
+                                     int* iphase,
+                                     realw* send_buffer,
+                                     int* IREGION,
+                                     int* FORWARD_OR_ADJOINT) {
+
+// synchronizes copy stream before copying buffers from pinned memory to CPU host
+
+  TRACE("sync_copy_from_device");
+
+  int size_mpi_buffer;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  // checks async-memcpy
+  if( GPU_ASYNC_COPY == 0 ){
+    exit_on_error("sync_copy_from_device must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+  }
+
+  // Wait until async-memcpy of outer elements is finished and start MPI.
+  if( *iphase != 2 ){ exit_on_error("sync_copy_from_device must be called for iphase == 2"); }
+
+  // regions
+  if( *IREGION == IREGION_CRUST_MANTLE ){
+    // crust/mantle
+    size_mpi_buffer = NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle;
+
+    if( size_mpi_buffer > 0 ){
+      // waits for asynchronous copy to finish
+      cudaStreamSynchronize(mp->copy_stream);
+
+      if(*FORWARD_OR_ADJOINT == 1) {
+        // There have been problems using pinned memory with MPI, so
+        // we copy the buffer into a non-pinned region.
+        memcpy(send_buffer,mp->h_send_accel_buffer_cm,size_mpi_buffer*sizeof(realw));
+
+      }else if(*FORWARD_OR_ADJOINT == 3){
+        // we copy the buffer into a non-pinned region.
+        memcpy(send_buffer,mp->h_b_send_accel_buffer_cm,size_mpi_buffer*sizeof(realw));
+      }
+    }
+  }else if( *IREGION == IREGION_INNER_CORE ){
+    // inner core
+    size_mpi_buffer = NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core;
+
+    if( size_mpi_buffer > 0 ){
+      // waits for asynchronous copy to finish
+      cudaStreamSynchronize(mp->copy_stream);
+
+      if(*FORWARD_OR_ADJOINT == 1) {
+        // There have been problems using pinned memory with MPI, so
+        // we copy the buffer into a non-pinned region.
+        memcpy(send_buffer,mp->h_send_accel_buffer_ic,size_mpi_buffer*sizeof(realw));
+
+      }else if(*FORWARD_OR_ADJOINT == 3){
+        // we copy the buffer into a non-pinned region.
+        memcpy(send_buffer,mp->h_b_send_accel_buffer_ic,size_mpi_buffer*sizeof(realw));
+      }
+    }
+  }else if( *IREGION == IREGION_OUTER_CORE ){
+    // outer core
+    size_mpi_buffer = mp->max_nibool_interfaces_oc*mp->num_interfaces_outer_core;
+
+    if( size_mpi_buffer > 0 ){
+      // waits for asynchronous copy to finish
+      cudaStreamSynchronize(mp->copy_stream);
+
+      if(*FORWARD_OR_ADJOINT == 1) {
+        // There have been problems using pinned memory with MPI, so
+        // we copy the buffer into a non-pinned region.
+        memcpy(send_buffer,mp->h_send_accel_buffer_oc,size_mpi_buffer*sizeof(realw));
+
+      }else if(*FORWARD_OR_ADJOINT == 3){
+        // we copy the buffer into a non-pinned region.
+        memcpy(send_buffer,mp->h_b_send_accel_buffer_oc,size_mpi_buffer*sizeof(realw));
+      }
+    }
+  }
+
+  // memory copy is now finished, so non-blocking MPI send can proceed
+}
+
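
The new sync_copy_from_device above works around problems seen when handing pinned (page-locked) memory directly to MPI: it waits for the asynchronous device-to-host copy and then duplicates the buffer into ordinary host memory before the non-blocking send. A minimal sketch of that staging step, with hypothetical names:

  #include <string.h>
  #include <cuda_runtime.h>

  void example_stage_for_mpi(float *send_buffer,    /* ordinary host memory  */
                             const float *h_pinned, /* from cudaMallocHost() */
                             size_t n, cudaStream_t copy_stream) {
    // the async device-to-host copy into h_pinned must be complete
    cudaStreamSynchronize(copy_stream);
    // copy out of the pinned region; MPI_Isend can then use send_buffer safely
    memcpy(send_buffer, h_pinned, n * sizeof(float));
  }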

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -111,7 +111,7 @@
 #else
     myrank = 0;
 #endif
-    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+    sprintf(filename,"OUTPUT_FILES/error_message_%06d.txt",myrank);
     fp = fopen(filename,"a+");
     if (fp != NULL){
       fprintf(fp,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
@@ -152,7 +152,7 @@
 #else
   myrank = 0;
 #endif
-  sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+  sprintf(filename,"OUTPUT_FILES/error_message_%06d.txt",myrank);
   fp = fopen(filename,"a+");
   if (fp != NULL){
     fprintf(fp,"ERROR: %s\n",info);
@@ -185,7 +185,7 @@
 #else
     myrank = 0;
 #endif
-    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+    sprintf(filename,"OUTPUT_FILES/error_message_%06d.txt",myrank);
     fp = fopen(filename,"a+");
     if (fp != NULL){
       fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
@@ -509,11 +509,15 @@
   cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
 
   if(*FORWARD_OR_ADJOINT == 1 ){
-    get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_displ_outer_core,size,d_max);
+    get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_outer_core,size,d_max);
   }else if(*FORWARD_OR_ADJOINT == 3 ){
-    get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,size,d_max);
+    get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_outer_core,size,d_max);
   }
 
+  // explicitly waits until previous compute stream finishes
+  // (cudaMemcpy implicitly synchronizes all other cuda operations)
+  cudaStreamSynchronize(mp->compute_stream);
+
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),222);
 
@@ -650,11 +654,15 @@
   cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
 
   if(*FORWARD_OR_ADJOINT == 1 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,size,d_max);
+    get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,size,d_max);
   }else if(*FORWARD_OR_ADJOINT == 3 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,size,d_max);
+    get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,size,d_max);
   }
 
+  // explicitly waits until previous compute stream finishes
+  // (cudaMemcpy implicitly synchronizes all other cuda operations)
+  cudaStreamSynchronize(mp->compute_stream);
+
   // copies to CPU
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),222);
@@ -753,7 +761,7 @@
   max = 0.0f;
 
   // determines max for: eps_trace_over_3_crust_mantle
-  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_eps_trace_over_3_crust_mantle,size,d_max);
+  get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_eps_trace_over_3_crust_mantle,size,d_max);
 
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),221);
@@ -785,7 +793,7 @@
   max_eps = 0.0f;
 
   // determines max for: epsilondev_xx_crust_mantle
-  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,size,d_max);
+  get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,size,d_max);
 
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),222);
@@ -796,7 +804,7 @@
   max_eps = MAX(max_eps,max);
 
   // determines max for: epsilondev_yy_crust_mantle
-  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_yy_crust_mantle,size,d_max);
+  get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_yy_crust_mantle,size,d_max);
 
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),223);
@@ -807,7 +815,7 @@
   max_eps = MAX(max_eps,max);
 
   // determines max for: epsilondev_xy_crust_mantle
-  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xy_crust_mantle,size,d_max);
+  get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xy_crust_mantle,size,d_max);
 
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),224);
@@ -818,7 +826,7 @@
   max_eps = MAX(max_eps,max);
 
   // determines max for: epsilondev_xz_crust_mantle
-  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xz_crust_mantle,size,d_max);
+  get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xz_crust_mantle,size,d_max);
 
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),225);
@@ -829,7 +837,7 @@
   max_eps = MAX(max_eps,max);
 
   // determines max for: epsilondev_yz_crust_mantle
-  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_yz_crust_mantle,size,d_max);
+  get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_yz_crust_mantle,size,d_max);
 
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
                                      cudaMemcpyDeviceToHost),226);
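
For reference, the pattern applied throughout this file is: enqueue the reduction on the compute stream, wait on that stream, then issue a blocking cudaMemcpy of the per-block maxima (the same pattern recurs in the Stacey routines further below). A minimal self-contained sketch, with illustrative names only (reduce_max_kernel and get_max_on_host are not SPECFEM routines):

#include <cuda_runtime.h>

// illustrative block-wise reduction: each block writes one partial maximum
__global__ void reduce_max_kernel(const float *in, int size, float *blockmax) {
  __shared__ float smax[128];
  int tid = threadIdx.x;
  int i = blockIdx.x * blockDim.x + tid;
  smax[tid] = (i < size) ? fabsf(in[i]) : 0.0f;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) smax[tid] = fmaxf(smax[tid], smax[tid + s]);
    __syncthreads();
  }
  if (tid == 0) blockmax[blockIdx.x] = smax[0];
}

void get_max_on_host(cudaStream_t compute_stream, const float *d_in, int size,
                     float *d_blockmax, float *h_blockmax, int nblocks) {
  // the 4th launch parameter selects the stream (the 3rd is dynamic shared memory, 0 here)
  reduce_max_kernel<<<nblocks, 128, 0, compute_stream>>>(d_in, size, d_blockmax);

  // the blocking cudaMemcpy below is issued to the default stream, so an
  // explicit wait on the compute stream guarantees the kernel result is ready
  cudaStreamSynchronize(compute_stream);
  cudaMemcpy(h_blockmax, d_blockmax, nblocks * sizeof(float), cudaMemcpyDeviceToHost);
  // the host then reduces the nblocks partial maxima to a single value
}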

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -72,12 +72,9 @@
 
       // note: for global version, sourcearrays has dimensions
       //            sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES)
-      atomicAdd(&accel[3*iglob], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                     0,i,j,k,isource)]*stf);
-      atomicAdd(&accel[3*iglob+1], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                       1,i,j,k,isource)]*stf);
-      atomicAdd(&accel[3*iglob+2], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                       2,i,j,k,isource)]*stf);
+      atomicAdd(&accel[3*iglob], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,isource)]*stf);
+      atomicAdd(&accel[3*iglob+1], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,isource)]*stf);
+      atomicAdd(&accel[3*iglob+2], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,isource)]*stf);
     }
   }
 }
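
The atomicAdd calls above are required because neighbouring spectral elements share global nodes: several threads can update accel at the same iglob concurrently, and a plain += would be a read-modify-write race. A stripped-down sketch of the scatter-add pattern (illustrative names, not the actual SPECFEM kernel):

// illustrative scatter-add: contributions from many points accumulate into
// a shared global acceleration array
__global__ void scatter_add_kernel(float *accel, const int *iglob_of_point,
                                   const float *contribution, int npoints) {
  int t = blockIdx.x * blockDim.x + threadIdx.x;
  if (t < npoints) {
    // several points may map to the same global node: atomicAdd serializes
    // the conflicting updates so that no contribution is lost
    atomicAdd(&accel[iglob_of_point[t]], contribution[t]);
  }
}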
@@ -111,7 +108,7 @@
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71018);
 
   // adds source contributions
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
                                                mp->d_ibool_crust_mantle,
                                                mp->d_sourcearrays,
                                                mp->d_stf_pre_compute,
@@ -157,7 +154,7 @@
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71019);
 
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
                                                mp->d_ibool_crust_mantle,
                                                mp->d_sourcearrays,
                                                mp->d_stf_pre_compute,
@@ -202,14 +199,9 @@
     iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
 
     // atomic operations are absolutely necessary for correctness!
-    atomicAdd(&accel[3*iglob], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                       0,i,j,k,irec_local)]);
-
-    atomicAdd(&accel[3*iglob+1], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                         1,i,j,k,irec_local)]);
-
-    atomicAdd(&accel[3*iglob+2], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                         2,i,j,k,irec_local)]);
+    atomicAdd(&accel[3*iglob], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)]);
+    atomicAdd(&accel[3*iglob+1], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)]);
+    atomicAdd(&accel[3*iglob+2], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)]);
   }
 }
 
@@ -218,12 +210,11 @@
 extern "C"
 void FC_FUNC_(compute_add_sources_adjoint_cuda,
               COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
-                                                int* nrec,
-                                                realw* h_adj_sourcearrays,
-                                                int* h_islice_selected_rec,
-                                                int* h_ispec_selected_rec,
-                                                int* time_index) {
+                                                int* h_nrec) {
 
+// adds adjoint sources
+// note: call this routine after transfer_adj_to_device**() so that d_adj_sourcearrays holds the correct adjoint source arrays
+
   TRACE("compute_add_sources_adjoint_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
@@ -231,47 +222,96 @@
   // check if anything to do
   if( mp->nadj_rec_local == 0 ) return;
 
+  // total number of receivers/adjoint sources
+  int nrec = *h_nrec;
+
+  // waits for previous transfer_** calls to finish
+  if( GPU_ASYNC_COPY ){
+    // waits for asynchronous copy to finish
+    cudaStreamSynchronize(mp->copy_stream);
+  }
+
   int num_blocks_x, num_blocks_y;
   get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(NGLLX,NGLLX,NGLLX);
 
+  // the irec_local variable needs to be precomputed on the host (as
+  // h_pre_computed_irec), because it is normally incremented inside the loop
+  // that updates accel and therefore cannot be parallelized
+  compute_add_sources_adjoint_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
+                                                                                 nrec,
+                                                                                 mp->d_adj_sourcearrays,
+                                                                                 mp->d_ibool_crust_mantle,
+                                                                                 mp->d_ispec_selected_rec,
+                                                                                 mp->d_pre_computed_irec,
+                                                                                 mp->nadj_rec_local);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_adjoint_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// adjoint memory transfers
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_adj_to_device,
+              TRANSFER_ADJ_TO_DEVICE)(long* Mesh_pointer,
+                                      int* h_nrec,
+                                      realw* h_adj_sourcearrays,
+                                      int* h_islice_selected_rec) {
+
+// transfers adjoint source arrays synchronously to GPU
+
+  TRACE("transfer_adj_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // check if anything to do
+  if( mp->nadj_rec_local == 0 ) return;
+
+  // total number of receivers/adjoint sources
+  int nrec = *h_nrec;
+
   // build slice of adj_sourcearrays because full array is *very* large.
-  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //
+  // note: this copies array values for local adjoint sources at the given time step "iadj_vec(it)"
   //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
-  int i,j,k;
-  int irec_local = 0;
-  for(int irec = 0; irec < *nrec; irec++) {
+  //
+  // dimensions of the global array version:
+  //   adj_sourcearrays is (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
+  // the argument passed here is a pointer to the slice at time iadj_vec(it),
+  //    which has dimension (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local)
+  int i,j,k,irec_local;
+
+  irec_local = 0;
+  for(int irec = 0; irec < nrec; irec++) {
     if(mp->myrank == h_islice_selected_rec[irec]) {
-      irec_local++;
-
       // takes only local sources
       for(k=0;k<NGLLX;k++) {
         for(j=0;j<NGLLX;j++) {
           for(i=0;i<NGLLX;i++) {
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)]
+              = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)];
 
-            // note: global version uses dimensions
-            //          h_adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
-            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                0,i,j,k,irec_local-1)]
-                    = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
-                                                0,i,j,k,irec_local-1,*time_index-1)];
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)]
+              = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)];
 
-            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                1,i,j,k,irec_local-1)]
-                    = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
-                                                1,i,j,k,irec_local-1,*time_index-1)];
-
-            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                2,i,j,k,irec_local-1)]
-                    = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
-                                                2,i,j,k,irec_local-1,*time_index-1)];
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)]
+              = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)];
           }
         }
       }
+      // increments the local receiver counter
+      irec_local++;
     }
   }
+
   // check all local sources were added
   if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
 
@@ -279,20 +319,85 @@
   print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
                                      (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),71000);
 
-
-  // the irec_local variable needs to be precomputed (as
-  // h_pre_comp..), because normally it is in the loop updating accel,
-  // and due to how it's incremented, it cannot be parallelized
-  compute_add_sources_adjoint_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
-                                                            *nrec,
-                                                            mp->d_adj_sourcearrays,
-                                                            mp->d_ibool_crust_mantle,
-                                                            mp->d_ispec_selected_rec,
-                                                            mp->d_pre_computed_irec,
-                                                            mp->nadj_rec_local);
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_adjoint_cuda");
+  exit_on_cuda_error("transfer_adj_to_device");
 #endif
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(transfer_adj_to_device_async,
+              TRANSFER_ADJ_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+                                            int* h_nrec,
+                                            realw* h_adj_sourcearrays,
+                                            int* h_islice_selected_rec) {
+
+// asynchronously transfers the next adjoint source arrays from host to device
+
+  TRACE("transfer_adj_to_device_async");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // check if anything to do
+  if( mp->nadj_rec_local == 0 ) return;
+
+  // checks async-memcpy
+  if( GPU_ASYNC_COPY == 0 ){
+    exit_on_error("transfer_adj_to_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+  }
+
+  // total number of receivers/adjoint sources
+  int nrec = *h_nrec;
+
+  // builds slice of adj_sourcearrays because the full array is *very* large.
+  //
+  // note: this copies array values for local adjoint sources at the given time step "iadj_vec(it)"
+  //          from the large adj_sourcearrays array into h_adj_sourcearrays_slice
+  //
+  // dimensions of the global array version:
+  //   adj_sourcearrays is (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
+  // the argument passed here is a pointer to the slice at time iadj_vec(it),
+  //    which has dimension (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local)
+  int i,j,k,irec_local;
+
+  // waits for the previous copy_stream transfer to finish
+  cudaStreamSynchronize(mp->copy_stream);
+
+  irec_local = 0;
+  for(int irec = 0; irec < nrec; irec++) {
+    if(mp->myrank == h_islice_selected_rec[irec]) {
+      // takes only local sources
+      for(k=0;k<NGLLX;k++) {
+        for(j=0;j<NGLLX;j++) {
+          for(i=0;i<NGLLX;i++) {
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)]
+              = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)];
+
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)]
+              = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)];
+
+            mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)]
+              = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)];
+          }
+        }
+      }
+      // increments the local receiver counter
+      irec_local++;
+    }
+  }
+
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // waits for the previous compute_add_sources_adjoint_cuda_kernel() call to finish
+  cudaStreamSynchronize(mp->compute_stream);
+
+  // copies extracted array values onto GPU
+  // (asynchronous copy to GPU using copy_stream)
+  cudaMemcpyAsync(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,(mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),
+                  cudaMemcpyHostToDevice,mp->copy_stream);
+
+}
+
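
Splitting the old routine into transfer_adj_to_device_async() plus compute_add_sources_adjoint_cuda() is what enables the overlap: the CPU fills the pinned slice and stages it on copy_stream while compute_stream is still busy, and the two streams synchronize only at the hand-over points. A minimal sketch of that hand-shake, assuming a single device buffer reused every step (all names illustrative):

#include <cuda_runtime.h>

// one step of the hand-shake; h_pinned_buf must come from cudaMallocHost(),
// otherwise cudaMemcpyAsync() silently degrades to a blocking copy
void stage_and_consume(cudaStream_t compute_stream, cudaStream_t copy_stream,
                       float *d_buf, const float *h_pinned_buf, size_t n) {
  // d_buf may still be read by the kernel of the previous step:
  // wait on the compute stream before overwriting it
  cudaStreamSynchronize(compute_stream);

  // asynchronous host->device copy; the host returns immediately
  // and can do other work (e.g. file I/O for the next slice)
  cudaMemcpyAsync(d_buf, h_pinned_buf, n * sizeof(float),
                  cudaMemcpyHostToDevice, copy_stream);

  // ... later, just before the consuming kernel is launched on the
  // compute stream, make sure the staged copy has landed:
  cudaStreamSynchronize(copy_stream);
  // add_sources_kernel<<<grid, block, 0, compute_stream>>>(d_buf, ...);
}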

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -125,7 +125,7 @@
 
   // launches GPU kernel
   if( *FORWARD_OR_ADJOINT == 1 ){
-    compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+    compute_coupling_fluid_CMB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
                                                         mp->d_accel_outer_core,
                                                         mp->d_ibool_crust_mantle,
                                                         mp->d_ibelm_bottom_crust_mantle,
@@ -140,7 +140,7 @@
     DEBUG_BACKWARD_COUPLING();
 
     // adjoint simulations
-    compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+    compute_coupling_fluid_CMB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,
                                                         mp->d_b_accel_outer_core,
                                                         mp->d_ibool_crust_mantle,
                                                         mp->d_ibelm_bottom_crust_mantle,
@@ -245,7 +245,7 @@
 
   // launches GPU kernel
   if( *FORWARD_OR_ADJOINT == 1 ){
-    compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+    compute_coupling_fluid_ICB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_inner_core,
                                                         mp->d_accel_outer_core,
                                                         mp->d_ibool_inner_core,
                                                         mp->d_ibelm_top_inner_core,
@@ -260,7 +260,7 @@
     DEBUG_BACKWARD_COUPLING();
 
     // adjoint simulations
-    compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+    compute_coupling_fluid_ICB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_inner_core,
                                                         mp->d_b_accel_outer_core,
                                                         mp->d_ibool_inner_core,
                                                         mp->d_ibelm_top_inner_core,
@@ -371,7 +371,7 @@
 
   // launches GPU kernel
   if( *FORWARD_OR_ADJOINT == 1 ){
-    compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+    compute_coupling_CMB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
                                                         mp->d_accel_crust_mantle,
                                                         mp->d_accel_outer_core,
                                                         mp->d_ibool_crust_mantle,
@@ -390,7 +390,7 @@
     DEBUG_BACKWARD_COUPLING();
 
     //  adjoint simulations
-    compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+    compute_coupling_CMB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,
                                                         mp->d_b_accel_crust_mantle,
                                                         mp->d_b_accel_outer_core,
                                                         mp->d_ibool_crust_mantle,
@@ -504,7 +504,7 @@
 
   // launches GPU kernel
   if( *FORWARD_OR_ADJOINT == 1 ){
-    compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+    compute_coupling_ICB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_inner_core,
                                                         mp->d_accel_inner_core,
                                                         mp->d_accel_outer_core,
                                                         mp->d_ibool_inner_core,
@@ -523,7 +523,7 @@
     DEBUG_BACKWARD_COUPLING();
 
     //  adjoint simulations
-    compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+    compute_coupling_ICB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_inner_core,
                                                         mp->d_b_accel_inner_core,
                                                         mp->d_b_accel_outer_core,
                                                         mp->d_ibool_inner_core,
@@ -624,7 +624,7 @@
 
   // uses corrected mass matrices
   if( *FORWARD_OR_ADJOINT == 1 ){
-    compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+    compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
                                                          mp->d_rmassx_crust_mantle,
                                                          mp->d_rmassy_crust_mantle,
                                                          mp->d_rmassz_crust_mantle,
@@ -637,7 +637,7 @@
     DEBUG_BACKWARD_COUPLING();
 
     // for backward/reconstructed potentials
-    compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+    compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
                                                          mp->d_b_rmassx_crust_mantle,
                                                          mp->d_b_rmassy_crust_mantle,
                                                          mp->d_b_rmassz_crust_mantle,
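
All of these launches first shape the grid with get_blocks_xy(). A plausible sketch of such a helper, under the assumption that it exists to respect the 65535 limit on gridDim.x of pre-Kepler GPUs; the actual implementation in the code base may differ:

// splits a linear block count into a 2D grid (x,y); kernels then recover the
// linear block id as: int bid = blockIdx.x + blockIdx.y * gridDim.x;
// and must guard against the padding with: if (bid < nblocks) { ... }
static void get_blocks_xy_sketch(int nblocks, int *num_blocks_x, int *num_blocks_y) {
  *num_blocks_x = nblocks;
  *num_blocks_y = 1;
  while (*num_blocks_x > 65535) {
    *num_blocks_x = (*num_blocks_x + 1) / 2;  // halve x (rounding up) ...
    *num_blocks_y = *num_blocks_y * 2;        // ... and double y
  }
}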

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -59,11 +59,11 @@
 // templates for texture fetching
 // FORWARD_OR_ADJOINT == 1 <- forward arrays
 template<> __device__ float texfetch_displ_cm<1>(int x) { return tex1Dfetch(d_displ_cm_tex, x); }
-template<> __device__ float texfetch_veloc_cm<1>(int x) { return tex1Dfetch(d_veloc_cm_tex, x); }
+//template<> __device__ float texfetch_veloc_cm<1>(int x) { return tex1Dfetch(d_veloc_cm_tex, x); }
 template<> __device__ float texfetch_accel_cm<1>(int x) { return tex1Dfetch(d_accel_cm_tex, x); }
 // FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
 template<> __device__ float texfetch_displ_cm<3>(int x) { return tex1Dfetch(d_b_displ_cm_tex, x); }
-template<> __device__ float texfetch_veloc_cm<3>(int x) { return tex1Dfetch(d_b_veloc_cm_tex, x); }
+//template<> __device__ float texfetch_veloc_cm<3>(int x) { return tex1Dfetch(d_b_veloc_cm_tex, x); }
 template<> __device__ float texfetch_accel_cm<3>(int x) { return tex1Dfetch(d_b_accel_cm_tex, x); }
 #endif
 
@@ -469,8 +469,8 @@
   realw costwothetasq,costwophisq,sintwophisq;
   realw etaminone,twoetaminone;
   realw two_eta_aniso,four_eta_aniso,six_eta_aniso;
-  realw two_rhovsvsq,two_rhovshsq; // two_rhovpvsq,two_rhovphsq
-  realw four_rhovsvsq,four_rhovshsq; // four_rhovpvsq,four_rhovphsq
+  realw two_rhovsvsq,two_rhovshsq;
+  realw four_rhovsvsq,four_rhovshsq;
   realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
 
   // cosine and sine function in CUDA only supported for float
@@ -567,23 +567,17 @@
   twoetaminone = 2.0f * eta_aniso - 1.0f;
 
   // precompute some products to reduce the CPU time
-
   two_eta_aniso = 2.0f * eta_aniso;
   four_eta_aniso = 4.0f * eta_aniso;
   six_eta_aniso = 6.0f * eta_aniso;
 
-  //two_rhovpvsq = 2.0f * rhovpvsq;
-  //two_rhovphsq = 2.0f * rhovphsq;
   two_rhovsvsq = 2.0f * rhovsvsq;
   two_rhovshsq = 2.0f * rhovshsq;
 
-  //four_rhovpvsq = 4.0f * rhovpvsq;
-  //four_rhovphsq = 4.0f * rhovphsq;
   four_rhovsvsq = 4.0f * rhovsvsq;
   four_rhovshsq = 4.0f * rhovshsq;
 
   // the 21 anisotropic coefficients computed using Mathematica
-
   c11 = rhovphsq*sinphifour + 2.0f*cosphisq*sinphisq*
         (rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)*
         sinthetasq) + cosphifour*
@@ -1285,7 +1279,6 @@
 
     // update memory variables based upon the Runge-Kutta scheme
     if( ATTENUATION && ( ! PARTIAL_PHYS_DISPERSION_ONLY ) ){
-
       compute_element_cm_att_memory(tx,working_element,
                                     d_muvstore,
                                     factor_common,alphaval,betaval,gammaval,
@@ -1378,7 +1371,7 @@
   // cudaEventRecord( start, 0 );
 
   if( FORWARD_OR_ADJOINT == 1 ){
-    Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
+    Kernel_2_crust_mantle_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                   mp->NGLOB_CRUST_MANTLE,
                                                   d_ibool,
                                                   d_ispec_is_tiso,
@@ -1428,7 +1421,7 @@
     // debug
     DEBUG_BACKWARD_FORCES();
 
-    Kernel_2_crust_mantle_impl<<< grid,threads>>>(nb_blocks_to_compute,
+    Kernel_2_crust_mantle_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                    mp->NGLOB_CRUST_MANTLE,
                                                    d_ibool,
                                                    d_ispec_is_tiso,
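
The texfetch_* specializations above let one templated kernel serve both the forward pass (FORWARD_OR_ADJOINT == 1) and the backward pass (== 3), selecting the texture reference at compile time instead of branching per thread; the veloc fetches are commented out, apparently because these kernels no longer read the velocity field through textures. A simplified sketch of the pattern for a single field, using the legacy texture-reference API this code base relies on (illustrative texture names):

// legacy texture references (the API in use here; texture objects replace them in later CUDA)
texture<float, 1, cudaReadModeElementType> d_field_tex;    // forward field
texture<float, 1, cudaReadModeElementType> d_b_field_tex;  // backward field

template <int FORWARD_OR_ADJOINT> __device__ float texfetch_field(int x);
template <> __device__ float texfetch_field<1>(int x) { return tex1Dfetch(d_field_tex, x); }
template <> __device__ float texfetch_field<3>(int x) { return tex1Dfetch(d_b_field_tex, x); }

// one templated kernel, instantiated once per pass; the texture choice is
// resolved at compile time, so there is no runtime branch in the inner loop
template <int FORWARD_OR_ADJOINT>
__global__ void copy_field_kernel(float *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = texfetch_field<FORWARD_OR_ADJOINT>(i);
}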

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -978,7 +978,7 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
   if( FORWARD_OR_ADJOINT == 1 ){
-    Kernel_2_inner_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+    Kernel_2_inner_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                mp->NGLOB_INNER_CORE,
                                                d_ibool,
                                                d_idoubling,
@@ -1026,7 +1026,7 @@
     // debug
     DEBUG_BACKWARD_FORCES();
 
-    Kernel_2_inner_core_impl<<< grid,threads>>>(nb_blocks_to_compute,
+    Kernel_2_inner_core_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                 mp->NGLOB_INNER_CORE,
                                                 d_ibool,
                                                 d_idoubling,

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -494,7 +494,7 @@
   // cudaEventRecord( start, 0 );
 
   if( FORWARD_OR_ADJOINT == 1 ){
-    Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+    Kernel_2_outer_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                 mp->NGLOB_OUTER_CORE,
                                                 d_ibool,
                                                 mp->d_phase_ispec_inner_outer_core,
@@ -525,7 +525,7 @@
     // debug
     DEBUG_BACKWARD_FORCES();
 
-    Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+    Kernel_2_outer_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                                 mp->NGLOB_OUTER_CORE,
                                                 d_ibool,
                                                 mp->d_phase_ispec_inner_outer_core,

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -515,7 +515,7 @@
   dim3 threads(blocksize,1,1);
 
   // density kernel
-  compute_kernels_rho_cudakernel<<<grid,threads>>>(mp->d_ibool_crust_mantle,
+  compute_kernels_rho_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_crust_mantle,
                                                    mp->d_accel_crust_mantle,
                                                    mp->d_b_displ_crust_mantle,
                                                    mp->d_rho_kl_crust_mantle,
@@ -533,7 +533,7 @@
     // computes strain locally based on current backward/reconstructed (b_displ) wavefield
     if(! mp->anisotropic_kl){
       // isotropic kernels
-      compute_kernels_iso_undo_att_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+      compute_kernels_iso_undo_att_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
                                                        mp->d_epsilondev_yy_crust_mantle,
                                                        mp->d_epsilondev_xy_crust_mantle,
                                                        mp->d_epsilondev_xz_crust_mantle,
@@ -551,7 +551,7 @@
                                                        mp->d_hprime_xx);
     }else{
       // anisotropic kernels
-      compute_kernels_ani_undo_att_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+      compute_kernels_ani_undo_att_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
                                                        mp->d_epsilondev_yy_crust_mantle,
                                                        mp->d_epsilondev_xy_crust_mantle,
                                                        mp->d_epsilondev_xz_crust_mantle,
@@ -572,7 +572,7 @@
     // takes strain arrays computed from previous compute_forces call
     if(! mp->anisotropic_kl){
       // isotropic kernels
-    compute_kernels_iso_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+    compute_kernels_iso_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
                                                      mp->d_epsilondev_yy_crust_mantle,
                                                      mp->d_epsilondev_xy_crust_mantle,
                                                      mp->d_epsilondev_xz_crust_mantle,
@@ -590,7 +590,7 @@
                                                      deltat);
     }else{
       // anisotropic kernels
-    compute_kernels_ani_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+    compute_kernels_ani_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
                                                      mp->d_epsilondev_yy_crust_mantle,
                                                      mp->d_epsilondev_xy_crust_mantle,
                                                      mp->d_epsilondev_xz_crust_mantle,
@@ -640,7 +640,7 @@
 
   // only isotropic kernels in inner core so far implemented
   // density kernel
-  compute_kernels_rho_cudakernel<<<grid,threads>>>(mp->d_ibool_inner_core,
+  compute_kernels_rho_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_inner_core,
                                                    mp->d_accel_inner_core,
                                                    mp->d_b_displ_inner_core,
                                                    mp->d_rho_kl_inner_core,
@@ -657,7 +657,7 @@
 
     // computes strain locally based on current backward/reconstructed (b_displ) wavefield
     // isotropic kernels (shear, bulk)
-    compute_kernels_iso_undo_att_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_inner_core,
+    compute_kernels_iso_undo_att_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_inner_core,
                                                                mp->d_epsilondev_yy_inner_core,
                                                                mp->d_epsilondev_xy_inner_core,
                                                                mp->d_epsilondev_xz_inner_core,
@@ -678,7 +678,7 @@
     // takes strain arrays computed from previous compute_forces call
 
     // isotropic kernels (shear, bulk)
-    compute_kernels_iso_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_inner_core,
+    compute_kernels_iso_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_inner_core,
                                                      mp->d_epsilondev_yy_inner_core,
                                                      mp->d_epsilondev_xy_inner_core,
                                                      mp->d_epsilondev_xz_inner_core,
@@ -881,7 +881,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  compute_kernels_acoustic_kernel<<<grid,threads>>>(mp->d_ibool_outer_core,
+  compute_kernels_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_outer_core,
                                                     mp->d_rhostore_outer_core,
                                                     mp->d_kappavstore_outer_core,
                                                     mp->d_hprime_xx,
@@ -975,7 +975,7 @@
                                      cudaMemcpyHostToDevice),90900);
 
   // calculates noise strength kernel
-  compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+  compute_kernels_strength_noise_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
                                                                mp->d_ibelm_top_crust_mantle,
                                                                mp->d_ibool_crust_mantle,
                                                                mp->d_noise_surface_movie,
@@ -1032,7 +1032,7 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // checks
-  if( ! mp->approximate_hess_kl ){exit_on_cuda_error("approximate_hess_kl flag not properly initialized");}
+  if( ! mp->approximate_hess_kl ){exit_on_error("approximate_hess_kl flag not properly initialized");}
 
   int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
   realw deltat = *deltat_f;
@@ -1043,7 +1043,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  compute_kernels_hess_cudakernel<<<grid,threads>>>(mp->d_ibool_crust_mantle,
+  compute_kernels_hess_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_crust_mantle,
                                                     mp->d_accel_crust_mantle,
                                                     mp->d_b_accel_crust_mantle,
                                                     mp->d_hess_kl_crust_mantle,
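
The exit_on_cuda_error() to exit_on_error() swaps in this file (and in the Stacey and prepare routines below) follow one rule: the conditions tested are plain host-side logic errors, so querying the CUDA runtime would be misleading. A hedged sketch of the distinction; the real helpers live elsewhere in src/cuda and may differ:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// for host-side logic failures (bad flags, size mismatches): just report and abort
static void exit_on_error_sketch(const char *msg) {
  fprintf(stderr, "Error: %s\n", msg);
  exit(EXIT_FAILURE);
}

// for failures of CUDA calls: also interrogate the runtime for the last error
static void exit_on_cuda_error_sketch(const char *msg) {
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s: %s\n", msg, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}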

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -230,7 +230,7 @@
       break;
 
     default:
-      exit_on_cuda_error("compute_stacey_acoustic_cuda: unknown interface type");
+      exit_on_error("compute_stacey_acoustic_cuda: unknown interface type");
       break;
   }
 
@@ -251,7 +251,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_veloc_outer_core,
+  compute_stacey_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_outer_core,
                                                    mp->d_accel_outer_core,
                                                    interface_type,
                                                    num_abs_boundary_faces,
@@ -270,7 +270,11 @@
                                                    d_b_absorb_potential);
 
   //  adjoint simulations: stores absorbed wavefield part
-  if( mp->save_forward && num_abs_boundary_faces > 0 ){
+  if( mp->save_forward ){
+    // explicitly waits until previous operations on the compute stream finish
+    // (cudaMemcpy implicitly synchronizes all other cuda operations)
+    cudaStreamSynchronize(mp->compute_stream);
+
     // copies array to CPU
     print_CUDA_error_if_any(cudaMemcpy(absorb_potential,d_b_absorb_potential,
                                        NGLL2*num_abs_boundary_faces*sizeof(realw),
@@ -437,7 +441,7 @@
       break;
 
     default:
-      exit_on_cuda_error("compute_stacey_acoustic_cuda: unknown interface type");
+      exit_on_error("compute_stacey_acoustic_cuda: unknown interface type");
       break;
   }
 
@@ -466,7 +470,7 @@
                                        cudaMemcpyHostToDevice),7700);
   }
 
-  compute_stacey_acoustic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_outer_core,
+  compute_stacey_acoustic_backward_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
                                                             d_b_absorb_potential,
                                                             interface_type,
                                                             num_abs_boundary_faces,
@@ -552,7 +556,7 @@
       break;
 
     default:
-      exit_on_cuda_error("compute_stacey_acoustic_undoatt_cuda: unknown interface type");
+      exit_on_error("compute_stacey_acoustic_undoatt_cuda: unknown interface type");
       break;
   }
 
@@ -573,7 +577,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_b_veloc_outer_core,
+  compute_stacey_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_outer_core,
                                                    mp->d_b_accel_outer_core,
                                                    interface_type,
                                                    num_abs_boundary_faces,

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -237,7 +237,7 @@
       break;
 
     default:
-      exit_on_cuda_error("compute_stacey_elastic_cuda: unknown interface type");
+      exit_on_error("compute_stacey_elastic_cuda: unknown interface type");
       break;
   }
 
@@ -259,7 +259,7 @@
   dim3 threads(blocksize,1,1);
 
   // absorbing boundary contributions
-  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc_crust_mantle,
+  compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_crust_mantle,
                                                   mp->d_accel_crust_mantle,
                                                   interface_type,
                                                   num_abs_boundary_faces,
@@ -281,7 +281,11 @@
 
 
   // adjoint simulations: stores absorbed wavefield part
-  if(mp->save_forward && num_abs_boundary_faces > 0 ) {
+  if( mp->save_forward ) {
+    // explicitly waits until previous operations on the compute stream finish
+    // (cudaMemcpy implicitly synchronizes all other cuda operations)
+    cudaStreamSynchronize(mp->compute_stream);
+
     // copies array to CPU
     print_CUDA_error_if_any(cudaMemcpy(absorb_field,d_b_absorb_field,
                             NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyDeviceToHost),7701);
@@ -438,7 +442,7 @@
       break;
 
     default:
-      exit_on_cuda_error("compute_stacey_elastic_cuda: unknown interface type");
+      exit_on_error("compute_stacey_elastic_cuda: unknown interface type");
       break;
   }
 
@@ -465,7 +469,7 @@
                                      NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
 
   // absorbing boundary contributions
-  compute_stacey_elastic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+  compute_stacey_elastic_backward_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
                                                            d_b_absorb_field,
                                                            interface_type,
                                                            num_abs_boundary_faces,
@@ -548,7 +552,7 @@
       break;
 
     default:
-      exit_on_cuda_error("compute_stacey_elastic_undoatt_cuda: unknown interface type");
+      exit_on_error("compute_stacey_elastic_undoatt_cuda: unknown interface type");
       break;
   }
 
@@ -570,7 +574,7 @@
   dim3 threads(blocksize,1,1);
 
   // absorbing boundary contributions
-  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_b_veloc_crust_mantle,
+  compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_crust_mantle,
                                                   mp->d_b_accel_crust_mantle,
                                                   interface_type,
                                                   num_abs_boundary_faces,

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -88,6 +88,8 @@
   // Gets number of GPU devices
   device_count = 0;
   cudaGetDeviceCount(&device_count);
+
+  // checks if command failed
   exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on titan enable environment: CRAY_CUDA_PROXY=1 to use single GPU with multiple MPI processes\n\nexiting...\n");
 
   // returns device count to fortran

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h	2014-01-13 16:06:21 UTC (rev 22991)
@@ -52,6 +52,7 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// debug: outputs traces
 #define DEBUG 0
 #if DEBUG == 1
 #define TRACE(x) printf("%s\n",x);
@@ -59,6 +60,7 @@
 #define TRACE(x) // printf("%s\n",x);
 #endif
 
+// more verbose debug output
 #define MAXDEBUG 0
 #if MAXDEBUG == 1
 #define LOG(x) printf("%s\n",x)
@@ -90,13 +92,14 @@
 #define DEBUG_BACKWARD_UPDATE()
 #endif
 
-
 // error checking after cuda function calls
-#define ENABLE_VERY_SLOW_ERROR_CHECKING
+// (note: this synchronizes many calls, thus e.g. no asynchronous memcpy is possible)
+//#define ENABLE_VERY_SLOW_ERROR_CHECKING
 
 // maximum function
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // cuda constant arrays
@@ -122,7 +125,8 @@
 
 // region ids
 #define IREGION_CRUST_MANTLE  1
-#define IREGION_INNER_CORE  3
+#define IREGION_OUTER_CORE    2
+#define IREGION_INNER_CORE    3
 
 // inner core : fictitious elements id (from constants.h)
 #define IFLAG_IN_FICTITIOUS_CUBE  11
@@ -132,6 +136,9 @@
 // uncomment line below for PREM with oceans
 //#define R_EARTH_KM 6368.0f
 
+// Asynchronous memory copies between GPU and CPU
+#define GPU_ASYNC_COPY 1
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
@@ -199,22 +206,14 @@
 
 // indexing
 
-#define INDEX2(xsize,x,y) x + (y)*xsize
+#define INDEX2(isize,i,j) i + isize*j
+#define INDEX3(isize,jsize,i,j,k) i + isize*(j + jsize*k)
+#define INDEX4(isize,jsize,ksize,i,j,k,x) i + isize*(j + jsize*(k + ksize*x))
+#define INDEX5(isize,jsize,ksize,xsize,i,j,k,x,y) i + isize*(j + jsize*(k + ksize*(x + xsize*y)))
 
-#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
-//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
+//#define INDEX6(isize,jsize,ksize,xsize,ysize,i,j,k,x,y,z) i + isize*(j + jsize*(k + ksize*(x + xsize*(y + ysize*z))))
+//#define INDEX4_PADDED(isize,jsize,ksize,i,j,k,x) i + isize*(j + jsize*k) + x*NGLL3_PADDED
 
-#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
-//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
-
-#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j))))
-//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
-
-#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
-
-#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED
-//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // type of "working" variables: see also CUSTOM_REAL
@@ -625,10 +624,15 @@
   int* d_number_receiver_global;
   int* d_ispec_selected_rec;
   int* d_islice_selected_rec;
+
   int nrec_local;
+
   realw* d_station_seismo_field;
   realw* h_station_seismo_field;
 
+  realw* d_station_strain_field;
+  realw* h_station_strain_field;
+
   // adjoint receivers/sources
   int nadj_rec_local;
   realw* d_adj_sourcearrays;
@@ -731,6 +735,33 @@
   // noise sensitivity kernel
   realw* d_Sigma_kl;
 
+  // ------------------------------------------------------------------ //
+  // optimizations
+  // ------------------------------------------------------------------ //
+
+  // overlapped memcpy streams
+  cudaStream_t compute_stream;
+  cudaStream_t copy_stream;
+
+  // buffers for mpi-send/recv, duplicated in Fortran but allocated here
+  // with pinned memory to facilitate asynchronous device <-> host
+  // memory transfers
+  // crust/mantle
+  float* h_send_accel_buffer_cm;
+  float* h_recv_accel_buffer_cm;
+  float* h_b_send_accel_buffer_cm;
+  float* h_b_recv_accel_buffer_cm;
+  // inner core
+  float* h_send_accel_buffer_ic;
+  float* h_recv_accel_buffer_ic;
+  float* h_b_send_accel_buffer_ic;
+  float* h_b_recv_accel_buffer_ic;
+  // outer core
+  float* h_send_accel_buffer_oc;
+  float* h_recv_accel_buffer_oc;
+  float* h_b_send_accel_buffer_oc;
+  float* h_b_recv_accel_buffer_oc;
+
 } Mesh;
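
The rewritten INDEX* macros keep exactly the same layout as before, with the first index varying fastest to match the column-major ordering of the Fortran arrays they mirror; only the parameter names changed from x/y/z/i/j to i/j/k/x/y. A standalone sanity check of the convention:

#include <assert.h>

// the macros as defined above
#define INDEX2(isize,i,j) i + isize*j
#define INDEX3(isize,jsize,i,j,k) i + isize*(j + jsize*k)

int main(void) {
  // a Fortran array a(3,4) stores a(i+1,j+1) at flat offset i + 3*j,
  // so INDEX2 with isize = 3 reproduces Fortran's column-major layout
  assert(INDEX2(3, 2, 1) == 2 + 3 * 1);
  // the higher-rank macros nest the same rule: first index varies fastest
  assert(INDEX3(3, 4, 2, 1, 1) == 2 + 3 * (1 + 4 * 1));
  return 0;
}

Note that the macro bodies do not parenthesize their arguments, so callers must pass simple index expressions, as the kernels do.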
 
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -195,7 +195,7 @@
       print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_ic_tex_ptr, mp->d_hprime_xx,
                                               &channelDesc, sizeof(realw)*(NGLL2)), 1106);
     #else
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
       print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_cm_tex, mp->d_hprime_xx,
                                               &channelDesc, sizeof(realw)*(NGLL2)), 1102);
       print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_oc_tex, mp->d_hprime_xx,
@@ -282,10 +282,24 @@
     // for seismograms
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
                                        NDIM*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+    // for transferring values from GPU to CPU
+    if( GPU_ASYNC_COPY ){
+      // todo
+      // only pinned memory can handle memcpy calls asynchronously
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_station_seismo_field),
+                                             NDIM*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+    }else{
+      mp->h_station_seismo_field = (realw*) malloc( NDIM*NGLL3*(mp->nrec_local)*sizeof(realw) );
+      if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
+    }
 
-    mp->h_station_seismo_field = (realw*) malloc( NDIM*NGLL3*(mp->nrec_local)*sizeof(realw) );
-    if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
-
+    // for adjoint strain
+    if( mp->simulation_type == 2 ){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_strain_field),
+                                         NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+      mp->h_station_strain_field = (realw*) malloc( NGLL3*(mp->nrec_local)*sizeof(realw) );
+      if( mp->h_station_strain_field == NULL) exit_on_error("h_station_strain_field not allocated \n");
+    }
   }
   copy_todevice_int((void**)&mp->d_ispec_selected_rec,h_ispec_selected_rec,(*nrec));
 
@@ -318,12 +332,17 @@
     free(h_pre_computed_irec);
 
     // temporary array to prepare extracted source array values
-    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw) );
-    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+    if( GPU_ASYNC_COPY ){
+      // note: allocate pinned buffers, otherwise cudaMemcpyAsync() will behave like cudaMemcpy(), i.e. synchronously.
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_adj_sourcearrays_slice),
+                                             (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw)),6011);
+    }else{
+      mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw) );
+      if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+    }
 
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
                                        (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw)),6003);
-
   }
 
   // for rotation and new attenuation
@@ -335,6 +354,13 @@
   mp->two_omega_earth = 0.f;
   mp->b_two_omega_earth = 0.f;
 
+  // sets up two streams, one for compute and one for host<->device memory copies
+  // uses pinned memory for asynchronous data transfers
+  // compute stream
+  cudaStreamCreate(&mp->compute_stream);
+  // copy stream (needed to transfer mpi buffers)
+  cudaStreamCreate(&mp->copy_stream);
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_constants_device");
 #endif
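
The cudaStreamCreate() and cudaMallocHost() calls introduced here have matching teardown requirements that this diff does not show: pinned memory must be released with cudaFreeHost() rather than free(), and streams with cudaStreamDestroy(). A minimal sketch of the full lifecycle, under the assumption that cleanup happens in the corresponding cleanup routine (names illustrative):

#include <cuda_runtime.h>

typedef float realw;

void lifecycle_sketch(size_t size_mpi_buffer) {
  cudaStream_t compute_stream, copy_stream;
  realw *h_send_buffer;

  // setup: one stream for kernels, one for host<->device copies
  cudaStreamCreate(&compute_stream);
  cudaStreamCreate(&copy_stream);
  // pinned (page-locked) host memory: a prerequisite for async copies
  cudaMallocHost((void **)&h_send_buffer, size_mpi_buffer * sizeof(realw));

  // ... time loop: kernels on compute_stream, cudaMemcpyAsync on copy_stream ...

  // teardown: pinned memory needs cudaFreeHost(), not free()
  cudaFreeHost(h_send_buffer);
  cudaStreamDestroy(copy_stream);
  cudaStreamDestroy(compute_stream);
}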
@@ -362,14 +388,13 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
   // arrays only needed when rotation is required
-  if( ! mp->rotation ){
-    exit_on_cuda_error("prepare_fields_rotation_device: rotation flag not properly initialized");
-  }
+  if( ! mp->rotation ){exit_on_error("prepare_fields_rotation_device: rotation flag not properly initialized");}
+
   // checks array size
   if( *NSPEC_OUTER_CORE_ROTATION != mp->NSPEC_OUTER_CORE){
     printf("error prepare_fields_rotation_device: rotation array has wrong size: %d instead of %d\n",
            *NSPEC_OUTER_CORE_ROTATION,mp->NSPEC_OUTER_CORE);
-    exit_on_cuda_error("prepare_fields_rotation_device: rotation array has wrong size");
+    exit_on_error("prepare_fields_rotation_device: rotation array has wrong size");
   }
 
   // rotation arrays (needed only for outer core region)
@@ -482,7 +507,7 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
   // checks flag
-  if( ! mp->attenuation ){ exit_on_cuda_error("prepare_fields_attenuat_device attenuation not properly initialized"); }
+  if( ! mp->attenuation ){ exit_on_error("prepare_fields_attenuat_device attenuation not properly initialized"); }
 
   // crust_mantle
   R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
@@ -605,7 +630,7 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
   // checks flag
-  if( ! mp->compute_and_store_strain ){ exit_on_cuda_error("prepare_fields_strain_device strain not properly initialized"); }
+  if( ! mp->compute_and_store_strain ){ exit_on_error("prepare_fields_strain_device strain not properly initialized"); }
 
   // crust_mantle
   R_size = NGLL3*mp->NSPEC_CRUST_MANTLE;
@@ -718,7 +743,7 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
   // checks flag
-  if( ! mp->absorbing_conditions ){ exit_on_cuda_error("prepare_fields_absorb_device absorbing_conditions not properly initialized"); }
+  if( ! mp->absorbing_conditions ){ exit_on_error("prepare_fields_absorb_device absorbing_conditions not properly initialized"); }
 
   // crust_mantle
   mp->nspec2D_xmin_crust_mantle = *nspec2D_xmin_crust_mantle;
@@ -901,6 +926,7 @@
   TRACE("prepare_mpi_buffers_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  int size_mpi_buffer;
 
   // prepares interprocess-edge exchange information
 
@@ -914,14 +940,29 @@
     // ibool entries (iglob indices) values on interface
     copy_todevice_int((void**)&mp->d_ibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,
                       (mp->num_interfaces_crust_mantle)*(mp->max_nibool_interfaces_cm));
+
+    size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle);
+
     // allocates mpi buffer for exchange with cpu
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),
-                                       NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),size_mpi_buffer*sizeof(realw)),4004);
     if( mp->simulation_type == 3){
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_crust_mantle),
-                                        NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_crust_mantle),size_mpi_buffer*sizeof(realw)),4004);
     }
 
+    // asynchronous MPI buffers
+    if( GPU_ASYNC_COPY ){
+      // note: allocates pinned MPI buffers.
+      //       MPI buffers use pinned memory allocated by cudaMallocHost, which
+      //       enables the use of asynchronous memory copies between host and device
+      // send buffer
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+      // receive buffer
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+      if( mp->simulation_type == 3){
+        print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_send_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+        print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_recv_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+      }
+    }
   }
 
   // inner core mesh
@@ -934,14 +975,30 @@
     // ibool entries (iglob indices) values on interface
     copy_todevice_int((void**)&mp->d_ibool_interfaces_inner_core,ibool_interfaces_inner_core,
                       (mp->num_interfaces_inner_core)*(mp->max_nibool_interfaces_ic));
+
+    size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core);
+
     // allocates mpi buffer for exchange with cpu
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),
-                                       NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),size_mpi_buffer*sizeof(realw)),4004);
     if( mp->simulation_type == 3){
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_inner_core),
-                                        NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_inner_core),size_mpi_buffer*sizeof(realw)),4004);
     }
 
+    // asynchronous MPI buffers
+    if( GPU_ASYNC_COPY ){
+      // note: allocates pinned MPI buffers.
+      //       MPI buffers use pinned memory allocated by cudaMallocHost, which
+      //       enables the use of asynchronous memory copies between host and device
+      // send buffer
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+      // receive buffer
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+      // adjoint
+      if( mp->simulation_type == 3){
+        print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_send_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+        print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_recv_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+      }
+    }
   }
 
   // outer core mesh
@@ -955,14 +1012,31 @@
     // ibool entries (iglob indices) values on interface
     copy_todevice_int((void**)&mp->d_ibool_interfaces_outer_core,ibool_interfaces_outer_core,
                       (mp->num_interfaces_outer_core)*(mp->max_nibool_interfaces_oc));
+
+    size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
+
     // allocates mpi buffer for exchange with cpu
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),
-                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),size_mpi_buffer*sizeof(realw)),4004);
     if( mp->simulation_type == 3){
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_outer_core),
-                                        (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_outer_core),size_mpi_buffer*sizeof(realw)),4004);
     }
+
+    // asynchronous MPI buffers
+    if( GPU_ASYNC_COPY ){
+      // note: allocates pinned MPI buffers.
+      //       MPI buffers use pinned memory allocated by cudaMallocHost, which
+      //       enables asynchronous memory copies between host and device
+      // send buffer
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+      // receive buffer
+      print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+      if( mp->simulation_type == 3){
+        print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_send_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+        print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_recv_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+      }
+    }
   }
+
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -999,7 +1073,7 @@
                                        NDIM*NGLL2*(mp->nspec2D_top_crust_mantle)*sizeof(realw)),7005);
   }else{
     // for global mesh: each crust/mantle slice should have at top a free surface
-    exit_on_cuda_error("prepare_fields_noise_device NSPEC_TOP not properly initialized");
+    exit_on_error("prepare_fields_noise_device NSPEC_TOP not properly initialized");
   }
 
 
@@ -1057,7 +1131,7 @@
   mp->npoin_oceans = *npoin_oceans;
 
   // checks for global partitions, each slice must have a top surface with points on it
-  if( mp->npoin_oceans == 0 ){ exit_on_cuda_error("prepare_oceans_device has zero npoin_oceans"); }
+  if( mp->npoin_oceans == 0 ){ exit_on_error("prepare_oceans_device has zero npoin_oceans"); }
 
   // global point indices
   copy_todevice_int((void**)&mp->d_ibool_ocean_load,h_iglob_ocean_load,mp->npoin_oceans);
@@ -1373,7 +1447,7 @@
                                                 &channelDesc, sizeof(realw)*size), 4023);
       }
     #else
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
       print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_cm_tex, mp->d_displ_crust_mantle,
                                               &channelDesc, sizeof(realw)*size), 4021);
       print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_cm_tex, mp->d_accel_crust_mantle,
@@ -1619,7 +1693,7 @@
                                                 &channelDesc, sizeof(realw)*size_glob), 5023);
       }
     #else
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
       print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_oc_tex, mp->d_displ_outer_core,
                                               &channelDesc, sizeof(realw)*size_glob), 5021);
       print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_oc_tex, mp->d_accel_outer_core,
@@ -1866,7 +1940,7 @@
 
       }
     #else
-      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
       print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_ic_tex, mp->d_displ_inner_core,
                                               &channelDesc, sizeof(realw)*size), 6021);
       print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_ic_tex, mp->d_accel_inner_core,
@@ -1949,6 +2023,9 @@
   // frees allocated memory arrays
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
+  // synchronizes device
+  synchronize_cuda();
+
   // frees memory on GPU
 
   //------------------------------------------
@@ -1968,14 +2045,26 @@
   if( mp->nrec_local > 0 ) {
     cudaFree(mp->d_number_receiver_global);
     cudaFree(mp->d_station_seismo_field);
-    free(mp->h_station_seismo_field);
+    if( GPU_ASYNC_COPY){
+      cudaFreeHost(mp->h_station_seismo_field);
+    }else{
+      free(mp->h_station_seismo_field);
+    }
+    if( mp->simulation_type == 2 ){
+      cudaFree(mp->d_station_strain_field);
+      free(mp->h_station_strain_field);
+    }
   }
   cudaFree(mp->d_ispec_selected_rec);
 
   if( mp->nadj_rec_local > 0 ){
     cudaFree(mp->d_adj_sourcearrays);
     cudaFree(mp->d_pre_computed_irec);
-    free(mp->h_adj_sourcearrays_slice);
+    if( GPU_ASYNC_COPY){
+      cudaFreeHost(mp->h_adj_sourcearrays_slice);
+    }else{
+      free(mp->h_adj_sourcearrays_slice);
+    }
   }
 
   //------------------------------------------
@@ -2164,18 +2253,42 @@
     cudaFree(mp->d_ibool_interfaces_crust_mantle);
     cudaFree(mp->d_send_accel_buffer_crust_mantle);
     if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_crust_mantle);
+    if( GPU_ASYNC_COPY){
+      cudaFreeHost(mp->h_send_accel_buffer_cm);
+      cudaFreeHost(mp->h_recv_accel_buffer_cm);
+      if( mp->simulation_type == 3 ){
+        cudaFreeHost(mp->h_b_send_accel_buffer_cm);
+        cudaFreeHost(mp->h_b_recv_accel_buffer_cm);
+      }
+    }
   }
   if( mp->num_interfaces_inner_core > 0 ){
     cudaFree(mp->d_nibool_interfaces_inner_core);
     cudaFree(mp->d_ibool_interfaces_inner_core);
     cudaFree(mp->d_send_accel_buffer_inner_core);
     if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_inner_core);
+    if( GPU_ASYNC_COPY){
+      cudaFreeHost(mp->h_send_accel_buffer_ic);
+      cudaFreeHost(mp->h_recv_accel_buffer_ic);
+      if( mp->simulation_type == 3 ){
+        cudaFreeHost(mp->h_b_send_accel_buffer_ic);
+        cudaFreeHost(mp->h_b_recv_accel_buffer_ic);
+      }
+    }
   }
   if( mp->num_interfaces_outer_core > 0 ){
     cudaFree(mp->d_nibool_interfaces_outer_core);
     cudaFree(mp->d_ibool_interfaces_outer_core);
     cudaFree(mp->d_send_accel_buffer_outer_core);
     if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_outer_core);
+    if( GPU_ASYNC_COPY){
+      cudaFreeHost(mp->h_send_accel_buffer_oc);
+      cudaFreeHost(mp->h_recv_accel_buffer_oc);
+      if( mp->simulation_type == 3 ){
+        cudaFreeHost(mp->h_b_send_accel_buffer_oc);
+        cudaFreeHost(mp->h_b_recv_accel_buffer_oc);
+      }
+    }
   }
 
   //------------------------------------------
@@ -2393,12 +2506,17 @@
     cudaFree(mp->d_normal_ocean_load);
   }
 
+  // cleans up asynchronous streams
+  cudaStreamDestroy(mp->compute_stream);
+  cudaStreamDestroy(mp->copy_stream);
+
+  // synchronizes device
+  synchronize_cuda();
+
   // releases previous contexts
 #if CUDA_VERSION < 4000
-  cudaThreadSynchronize();
   cudaThreadExit();
 #else
-  cudaDeviceSynchronize();
   cudaDeviceReset();
 #endif
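For reference, the pinned-buffer pattern introduced above works as follows: cudaMallocHost returns page-locked host memory, which is what allows cudaMemcpyAsync on a dedicated stream to overlap with kernel execution while the CPU posts MPI calls. A minimal standalone sketch of the pattern, using illustrative names and sizes that are not part of this patch:

// sketch: pinned host buffer + asynchronous device->host copy on a stream
#include <cuda_runtime.h>

int main() {
  const size_t n = 1 << 20;
  float *h_buf, *d_buf;
  cudaStream_t copy_stream;

  // pinned (page-locked) host memory: required for truly asynchronous copies
  cudaMallocHost((void**)&h_buf, n * sizeof(float));
  cudaMalloc((void**)&d_buf, n * sizeof(float));
  cudaMemset(d_buf, 0, n * sizeof(float));
  cudaStreamCreate(&copy_stream);

  // returns immediately; the copy proceeds while the CPU does other work
  cudaMemcpyAsync(h_buf, d_buf, n * sizeof(float),
                  cudaMemcpyDeviceToHost, copy_stream);

  // wait for the copy before reading h_buf (e.g. before an MPI_Isend)
  cudaStreamSynchronize(copy_stream);

  // cleanup mirrors prepare_cleanup_device() above: destroy the stream,
  // then release pinned memory with cudaFreeHost, never with free()
  cudaStreamDestroy(copy_stream);
  cudaFreeHost(h_buf);
  cudaFree(d_buf);
  return 0;
}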
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2014-01-13 16:06:21 UTC (rev 22991)
@@ -1,4 +1,4 @@
-/*
+/* 
 !=====================================================================
 !
 !          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0
@@ -34,97 +34,110 @@
 
 typedef float realw;
 
+ 
 
-
 //
 // src/cuda/assemble_MPI_scalar_cuda.cu
 //
 
 void FC_FUNC_(transfer_boun_pot_from_device,
               TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
-                                             realw* send_potential_dot_dot_buffer,
-                                             int* FORWARD_OR_ADJOINT){}
+                                             realw* send_buffer,
+                                             int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_asmbl_pot_to_device,
               TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
                                             realw* buffer_recv_scalar,
-                                            int* FORWARD_OR_ADJOINT) {}
+                                            int* FORWARD_OR_ADJOINT) {} 
 
 
 //
 // src/cuda/assemble_MPI_vector_cuda.cu
 //
 
-void FC_FUNC_(transfer_boun_accel_from_device,
-              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
-                                               realw* send_accel_buffer,
-                                               int* IREGION,
-                                               int* FORWARD_OR_ADJOINT){}
+void FC_FUNC_(transfer_boun_from_device,
+              TRANSFER_BOUN_FROM_DEVICE)(long* Mesh_pointer_f,
+                                         realw* send_accel_buffer,
+                                         int* IREGION,
+                                         int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_asmbl_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
                                               realw* buffer_recv_vector,
                                               int* IREGION,
-                                              int* FORWARD_OR_ADJOINT) {}
+                                              int* FORWARD_OR_ADJOINT) {} 
 
+void FC_FUNC_(transfer_buffer_to_device_async,
+              TRANSFER_BUFFER_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+                                             realw* buffer,
+                                             int* IREGION,
+                                             int* FORWARD_OR_ADJOINT) {} 
 
+void FC_FUNC_(sync_copy_from_device,
+              SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
+                                     int* iphase,
+                                     realw* send_buffer,
+                                     int* IREGION,
+                                     int* FORWARD_OR_ADJOINT) {} 
+
+
 //
 // src/cuda/check_fields_cuda.cu
 //
 
 void FC_FUNC_(pause_for_debug,
-              PAUSE_FOR_DEBUG)() {}
+              PAUSE_FOR_DEBUG)() {} 
 
 void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {} 
 
 void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} 
 
 void FC_FUNC_(check_norm_acoustic_from_device,
               CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
-                                                  int* FORWARD_OR_ADJOINT) {}
+                                                  int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(check_norm_elastic_from_device,
               CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                               long* Mesh_pointer_f,
-                                              int* FORWARD_OR_ADJOINT) {}
+                                              int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(check_norm_strain_from_device,
               CHECK_NORM_STRAIN_FROM_DEVICE)(realw* strain_norm,
                                              realw* strain_norm2,
-                                             long* Mesh_pointer_f) {}
+                                             long* Mesh_pointer_f) {} 
 
  void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} 
 
  void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} 
 
  void FC_FUNC_(check_max_norm_displ,
- CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} 
 
  void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} 
 
  void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} 
 
  void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} 
 
  void FC_FUNC_(check_max_norm_b_displ,
- CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} 
 
  void FC_FUNC_(check_max_norm_b_accel,
- CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} 
 
  void FC_FUNC_(check_error_vectors,
- CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} 
 
  void FC_FUNC_(get_max_accel,
- GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
 
 
 //
@@ -134,45 +147,53 @@
 void FC_FUNC_(compute_add_sources_cuda,
               COMPUTE_ADD_SOURCES_CUDA)(long* Mesh_pointer_f,
                                         int* NSOURCESf,
-                                        double* h_stf_pre_compute) {}
+                                        double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(compute_add_sources_backward_cuda,
               COMPUTE_ADD_SOURCES_BACKWARD_CUDA)(long* Mesh_pointer_f,
                                                  int* NSOURCESf,
-                                                 double* h_stf_pre_compute) {}
+                                                 double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(compute_add_sources_adjoint_cuda,
               COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
-                                                int* nrec,
-                                                realw* h_adj_sourcearrays,
-                                                int* h_islice_selected_rec,
-                                                int* h_ispec_selected_rec,
-                                                int* time_index) {}
+                                                int* h_nrec) {} 
 
+void FC_FUNC_(transfer_adj_to_device,
+              TRANSFER_ADJ_TO_DEVICE)(long* Mesh_pointer,
+                                      int* h_nrec,
+                                      realw* h_adj_sourcearrays,
+                                      int* h_islice_selected_rec) {} 
 
+void FC_FUNC_(transfer_adj_to_device_async,
+              TRANSFER_ADJ_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+                                            int* h_nrec,
+                                            realw* h_adj_sourcearrays,
+                                            int* h_islice_selected_rec) {} 
+
+
 //
 // src/cuda/compute_coupling_cuda.cu
 //
 
 void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
               COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {}
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_fluid_icb_cuda,
               COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {}
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
               COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {}
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_icb_fluid_cuda,
               COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {}
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
-                                           int* FORWARD_OR_ADJOINT) {}
+                                           int* FORWARD_OR_ADJOINT) {} 
 
 
 //
@@ -182,7 +203,7 @@
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
                                                 int* iphase,
-                                                int* FORWARD_OR_ADJOINT_f) {}
+                                                int* FORWARD_OR_ADJOINT_f) {} 
 
 
 //
@@ -192,7 +213,7 @@
 void FC_FUNC_(compute_forces_inner_core_cuda,
               COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
-                                              int* FORWARD_OR_ADJOINT_f) {}
+                                              int* FORWARD_OR_ADJOINT_f) {} 
 
 
 //
@@ -203,7 +224,7 @@
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
                                               realw* time_f,
-                                              int* FORWARD_OR_ADJOINT_f) {}
+                                              int* FORWARD_OR_ADJOINT_f) {} 
 
 
 //
@@ -211,22 +232,22 @@
 //
 
 void FC_FUNC_(compute_kernels_cm_cuda,
-              COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+              COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_ic_cuda,
-              COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+              COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_oc_cuda,
-              COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+              COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
                                                realw* h_noise_surface_movie,
-                                               realw* deltat_f) {}
+                                               realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
-                                         realw* deltat_f) {}
+                                         realw* deltat_f) {} 
 
 
 //
@@ -236,16 +257,16 @@
 void FC_FUNC_(compute_stacey_acoustic_cuda,
               COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
                                             realw* absorb_potential,
-                                            int* itype) {}
+                                            int* itype) {} 
 
 void FC_FUNC_(compute_stacey_acoustic_backward_cuda,
               COMPUTE_STACEY_ACOUSTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
                                                      realw* absorb_potential,
-                                                     int* itype) {}
+                                                     int* itype) {} 
 
 void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
               COMPUTE_STACEY_ACOUSTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
-                                                    int* itype) {}
+                                                    int* itype) {} 
 
 
 //
@@ -255,16 +276,16 @@
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
                                            realw* absorb_field,
-                                           int* itype) {}
+                                           int* itype) {} 
 
 void FC_FUNC_(compute_stacey_elastic_backward_cuda,
               COMPUTE_STACEY_ELASTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
                                                     realw* absorb_field,
-                                                    int* itype) {}
+                                                    int* itype) {} 
 
 void FC_FUNC_(compute_stacey_elastic_undoatt_cuda,
               COMPUTE_STACEY_ELASTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
-                                                   int* itype) {}
+                                                   int* itype) {} 
 
 
 //
@@ -272,39 +293,39 @@
 //
 
 void FC_FUNC_(initialize_cuda_device,
-              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { 
  fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
  exit(1);
-}
+} 
 
 
 //
 // src/cuda/noise_tomography_cuda.cu
 //
 
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} 
 
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} 
 
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} 
 
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} 
 
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {} 
 
 void FC_FUNC_(noise_transfer_surface_to_host,
               NOISE_TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
-                                              realw* h_noise_surface_movie) {}
+                                              realw* h_noise_surface_movie) {} 
 
 void FC_FUNC_(noise_add_source_master_rec_cu,
               NOISE_ADD_SOURCE_MASTER_REC_CU)(long* Mesh_pointer_f,
                                               int* it_f,
                                               int* irec_master_noise_f,
-                                              int* islice_selected_rec) {}
+                                              int* islice_selected_rec) {} 
 
 void FC_FUNC_(noise_add_surface_movie_cuda,
               NOISE_ADD_SURFACE_MOVIE_CUDA)(long* Mesh_pointer_f,
-                                            realw* h_noise_surface_movie) {}
+                                            realw* h_noise_surface_movie) {} 
 
 
 //
@@ -339,7 +360,7 @@
                                         int* SAVE_BOUNDARY_MESH_f,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* ANISOTROPIC_KL_f,int* APPROXIMATE_HESS_KL_f,
-                                        realw* deltat_f,realw* b_deltat_f) {}
+                                        realw* deltat_f,realw* b_deltat_f) {} 
 
 void FC_FUNC_(prepare_fields_rotation_device,
               PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -349,7 +370,7 @@
                                               realw* b_two_omega_earth_f,
                                               realw* b_A_array_rotation,
                                               realw* b_B_array_rotation,
-                                              int* NSPEC_OUTER_CORE_ROTATION) {}
+                                              int* NSPEC_OUTER_CORE_ROTATION) {} 
 
 void FC_FUNC_(prepare_fields_gravity_device,
               PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
@@ -363,7 +384,7 @@
                                              realw* minus_g_icb,
                                              realw* minus_g_cmb,
                                              double* RHO_BOTTOM_OC,
-                                             double* RHO_TOP_OC) {}
+                                             double* RHO_TOP_OC) {} 
 
 void FC_FUNC_(prepare_fields_attenuat_device,
               PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
@@ -392,7 +413,7 @@
                                                  realw* factor_common_inner_core,
                                                  realw* one_minus_sum_beta_inner_core,
                                                  realw* alphaval,realw* betaval,realw* gammaval,
-                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {}
+                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {} 
 
 void FC_FUNC_(prepare_fields_strain_device,
               PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
@@ -419,7 +440,7 @@
                                             realw* b_epsilondev_xz_inner_core,
                                             realw* b_epsilondev_yz_inner_core,
                                             realw* eps_trace_over_3_inner_core,
-                                            realw* b_eps_trace_over_3_inner_core) {}
+                                            realw* b_eps_trace_over_3_inner_core) {} 
 
 void FC_FUNC_(prepare_fields_absorb_device,
               PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
@@ -448,7 +469,7 @@
                                             int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
                                             realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
-                                            realw* vp_outer_core) {}
+                                            realw* vp_outer_core) {} 
 
 void FC_FUNC_(prepare_mpi_buffers_device,
               PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
@@ -463,7 +484,7 @@
                                           int* num_interfaces_outer_core,
                                           int* max_nibool_interfaces_oc,
                                           int* nibool_interfaces_outer_core,
-                                          int* ibool_interfaces_outer_core){}
+                                          int* ibool_interfaces_outer_core){} 
 
 void FC_FUNC_(prepare_fields_noise_device,
               PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
@@ -475,14 +496,14 @@
                                            realw* normal_y_noise,
                                            realw* normal_z_noise,
                                            realw* mask_noise,
-                                           realw* jacobian2D_top_crust_mantle) {}
+                                           realw* jacobian2D_top_crust_mantle) {} 
 
 void FC_FUNC_(prepare_oceans_device,
               PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
                                      int* npoin_oceans,
                                      int* h_iglob_ocean_load,
                                      realw* h_rmass_ocean_load_selected,
-                                     realw* h_normal_ocean_load) {}
+                                     realw* h_normal_ocean_load) {} 
 
 void FC_FUNC_(prepare_crust_mantle_device,
              PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
@@ -514,7 +535,7 @@
                                           int* NCHUNKS_VAL,
                                           int* num_colors_outer,
                                           int* num_colors_inner,
-                                          int* num_elem_colors) {}
+                                          int* num_elem_colors) {} 
 
 void FC_FUNC_(prepare_outer_core_device,
               PREPARE_OUTER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -539,7 +560,7 @@
                                          int* h_ibelm_bottom_outer_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
-                                         int* num_elem_colors) {}
+                                         int* num_elem_colors) {} 
 
 void FC_FUNC_(prepare_inner_core_device,
               PREPARE_INNER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -562,11 +583,11 @@
                                          int* h_ibelm_top_inner_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
-                                         int* num_elem_colors) {}
+                                         int* num_elem_colors) {} 
 
 void FC_FUNC_(prepare_cleanup_device,
               PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
-              int* NCHUNKS_VAL) {}
+              int* NCHUNKS_VAL) {} 
 
 
 //
@@ -574,88 +595,88 @@
 //
 
 void FC_FUNC_(transfer_fields_cm_to_device,
-              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_ic_to_device,
-              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_oc_to_device,
-              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_cm_to_device,
               TRANSFER_FIELDS_B_CM_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_ic_to_device,
               TRANSFER_FIELDS_B_IC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_oc_to_device,
               TRANSFER_FIELDS_B_OC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_cm_from_device,
-              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_ic_from_device,
-              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_oc_from_device,
-              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_cm_from_device,
               TRANSFER_B_FIELDS_CM_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {}
+                                                long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_ic_from_device,
               TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {}
+                                                long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_oc_from_device,
               TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {}
+                                                long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_displ_cm_from_device,
-              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_displ_cm_from_device,
-              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_displ_ic_from_device,
-              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_displ_ic_from_device,
-              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_displ_oc_from_device,
-              TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_displ_oc_from_device,
-              TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_veloc_cm_from_device,
-              TRANSFER_VELOC_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+              TRANSFER_VELOC_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_veloc_ic_from_device,
-              TRANSFER_VELOC_IC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+              TRANSFER_VELOC_IC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_veloc_oc_from_device,
-              TRANSFER_VELOC_OC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+              TRANSFER_VELOC_OC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_cm_to_device,
-              TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_cm_from_device,
-              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_accel_cm_from_device,
-              TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+              TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_ic_from_device,
-              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_oc_from_device,
-              TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_strain_cm_from_device,
               TRANSFER_STRAIN_CM_FROM_DEVICE)(long* Mesh_pointer,
@@ -664,7 +685,7 @@
                                                   realw* epsilondev_yy,
                                                   realw* epsilondev_xy,
                                                   realw* epsilondev_xz,
-                                                  realw* epsilondev_yz) {}
+                                                  realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_b_strain_cm_to_device,
               TRANSFER_B_STRAIN_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -672,7 +693,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {}
+                                              realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_strain_ic_from_device,
               TRANSFER_STRAIN_IC_FROM_DEVICE)(long* Mesh_pointer,
@@ -681,7 +702,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {}
+                                              realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_b_strain_ic_to_device,
               TRANSFER_B_STRAIN_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -689,7 +710,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {}
+                                              realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_rmemory_cm_from_device,
               TRANSFER_RMEMORY_CM_FROM_DEVICE)(long* Mesh_pointer,
@@ -697,7 +718,7 @@
                                                realw* R_yy,
                                                realw* R_xy,
                                                realw* R_xz,
-                                               realw* R_yz) {}
+                                               realw* R_yz) {} 
 
 void FC_FUNC_(transfer_b_rmemory_cm_to_device,
               TRANSFER_B_RMEMORY_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -705,7 +726,7 @@
                                                realw* b_R_yy,
                                                realw* b_R_xy,
                                                realw* b_R_xz,
-                                               realw* b_R_yz) {}
+                                               realw* b_R_yz) {} 
 
 void FC_FUNC_(transfer_rmemory_ic_from_device,
               TRANSFER_RMEMORY_IC_FROM_DEVICE)(long* Mesh_pointer,
@@ -713,7 +734,7 @@
                                                realw* R_yy,
                                                realw* R_xy,
                                                realw* R_xz,
-                                               realw* R_yz) {}
+                                               realw* R_yz) {} 
 
 void FC_FUNC_(transfer_b_rmemory_ic_to_device,
               TRANSFER_B_RMEMORY_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -721,17 +742,17 @@
                                                realw* b_R_yy,
                                                realw* b_R_xy,
                                                realw* b_R_xz,
-                                               realw* b_R_yz) {}
+                                               realw* b_R_yz) {} 
 
 void FC_FUNC_(transfer_rotation_from_device,
               TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
                                              realw* A_array_rotation,
-                                             realw* B_array_rotation) {}
+                                             realw* B_array_rotation) {} 
 
 void FC_FUNC_(transfer_b_rotation_to_device,
               TRANSFER_B_ROTATION_TO_DEVICE)(long* Mesh_pointer,
                                               realw* A_array_rotation,
-                                              realw* B_array_rotation) {}
+                                              realw* B_array_rotation) {} 
 
 void FC_FUNC_(transfer_kernels_cm_to_host,
               TRANSFER_KERNELS_CM_TO_HOST)(long* Mesh_pointer,
@@ -739,30 +760,30 @@
                                            realw* h_alpha_kl,
                                            realw* h_beta_kl,
                                            realw* h_cijkl_kl,
-                                           int* NSPEC) {}
+                                           int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_ic_to_host,
               TRANSFER_KERNELS_IC_TO_HOST)(long* Mesh_pointer,
                                            realw* h_rho_kl,
                                            realw* h_alpha_kl,
                                            realw* h_beta_kl,
-                                           int* NSPEC) {}
+                                           int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_oc_to_host,
               TRANSFER_KERNELS_OC_TO_HOST)(long* Mesh_pointer,
                                            realw* h_rho_kl,
                                            realw* h_alpha_kl,
-                                           int* NSPEC) {}
+                                           int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
                                               realw* h_Sigma_kl,
-                                              int* NSPEC) {}
+                                              int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_hess_cm_tohost,
               TRANSFER_KERNELS_HESS_CM_TOHOST)(long* Mesh_pointer,
                                               realw* h_hess_kl,
-                                              int* NSPEC) {}
+                                              int* NSPEC) {} 
 
 
 //
@@ -774,39 +795,39 @@
                                           realw* deltat_f,
                                           realw* deltatsqover2_f,
                                           realw* deltatover2_f,
-                                          int* FORWARD_OR_ADJOINT) {}
+                                          int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(update_displacement_cm_cuda,
               UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
                                           realw* deltat_f,
                                           realw* deltatsqover2_f,
                                           realw* deltatover2_f,
-                                          int* FORWARD_OR_ADJOINT) {}
+                                          int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(update_displacement_oc_cuda,
               UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
                                            realw* deltat_f,
                                            realw* deltatsqover2_f,
                                            realw* deltatover2_f,
-                                           int* FORWARD_OR_ADJOINT) {}
+                                           int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(multiply_accel_elastic_cuda,
               MULTIPLY_ACCEL_ELASTIC_CUDA)(long* Mesh_pointer,
-                                           int* FORWARD_OR_ADJOINT) {}
+                                           int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(update_veloc_elastic_cuda,
               UPDATE_VELOC_ELASTIC_CUDA)(long* Mesh_pointer,
                                          realw* deltatover2_f,
-                                         int* FORWARD_OR_ADJOINT) {}
+                                         int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(multiply_accel_acoustic_cuda,
               MULTIPLY_ACCEL_ACOUSTIC_CUDA)(long* Mesh_pointer,
-                                            int* FORWARD_OR_ADJOINT) {}
+                                            int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(update_veloc_acoustic_cuda,
               UPDATE_VELOC_ACOUSTIC_CUDA)(long* Mesh_pointer,
                                           realw* deltatover2_f,
-                                          int* FORWARD_OR_ADJOINT) {}
+                                          int* FORWARD_OR_ADJOINT) {} 
 
 
 //
@@ -826,5 +847,14 @@
                                                int* number_receiver_global,
                                                int* ispec_selected_rec,
                                                int* ispec_selected_source,
-                                               int* ibool) {}
+                                               int* ibool) {} 
 
+void FC_FUNC_(transfer_seismo_from_device_async,
+              TRANSFER_SEISMO_FROM_DEVICE_ASYNC)(long* Mesh_pointer_f,
+                                                 realw* displ,
+                                                 realw* b_displ,
+                                                 int* number_receiver_global,
+                                                 int* ispec_selected_rec,
+                                                 int* ispec_selected_source,
+                                                 int* ibool) {} 
+
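The stub file above exists so that builds configured without CUDA still link: each GPU entry point becomes an empty function carrying a Fortran-compatible symbol name. FC_FUNC_ is the autoconf-style name-mangling macro; both lowercase and UPPERCASE spellings are passed so the configure step can select whichever form the Fortran compiler expects. A simplified sketch of the mechanism, assuming a compiler that appends a single underscore (the real macro definition is generated at configure time):

/* simplified, assumed definition -- not the one generated by configure */
#define FC_FUNC_(name,NAME) name##_

/* expands to:  void pause_for_debug_() {}
   so Fortran can "call pause_for_debug()" and still link without CUDA */
void FC_FUNC_(pause_for_debug,
              PAUSE_FOR_DEBUG)() {}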

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -82,7 +82,7 @@
   int size = NDIM * mp->NGLOB_INNER_CORE;
 
   // debug
-#if DEBUG_BACKWARD_SIMULATIONS == 1
+#if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
   realw max_d,max_v,max_a;
   max_d = get_device_array_maximum_value(mp->d_b_displ_inner_core, size);
   max_v = get_device_array_maximum_value(mp->d_b_veloc_inner_core, size);
@@ -107,7 +107,7 @@
 
   if( *FORWARD_OR_ADJOINT == 1 ){
     //launch kernel
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+    UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_inner_core,
                                              mp->d_veloc_inner_core,
                                              mp->d_accel_inner_core,
                                              size,deltat,deltatsqover2,deltatover2);
@@ -116,7 +116,7 @@
     DEBUG_BACKWARD_UPDATE();
 
     // kernel for backward fields
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+    UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_inner_core,
                                              mp->d_b_veloc_inner_core,
                                              mp->d_b_accel_inner_core,
                                              size,deltat,deltatsqover2,deltatover2);
@@ -149,7 +149,7 @@
   int size = NDIM * mp->NGLOB_CRUST_MANTLE;
 
   // debug
-#if DEBUG_BACKWARD_SIMULATIONS == 1
+#if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
   realw max_d,max_v,max_a;
   max_d = get_device_array_maximum_value(mp->d_b_displ_crust_mantle, size);
   max_v = get_device_array_maximum_value(mp->d_b_veloc_crust_mantle, size);
@@ -174,7 +174,7 @@
 
   if( *FORWARD_OR_ADJOINT == 1 ){
     //launch kernel
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+    UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
                                              mp->d_veloc_crust_mantle,
                                              mp->d_accel_crust_mantle,
                                              size,deltat,deltatsqover2,deltatover2);
@@ -183,7 +183,7 @@
     DEBUG_BACKWARD_UPDATE();
 
     // kernel for backward fields
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+    UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,
                                              mp->d_b_veloc_crust_mantle,
                                              mp->d_b_accel_crust_mantle,
                                              size,deltat,deltatsqover2,deltatover2);
@@ -247,7 +247,7 @@
   int size = mp->NGLOB_OUTER_CORE;
 
   // debug
-#if DEBUG_BACKWARD_SIMULATIONS == 1
+#if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
   realw max_d,max_v,max_a;
   max_d = get_device_array_maximum_value(mp->d_b_displ_outer_core, size);
   max_v = get_device_array_maximum_value(mp->d_b_veloc_outer_core, size);
@@ -272,7 +272,7 @@
 
   if( *FORWARD_OR_ADJOINT == 1 ){
     //launch kernel
-    UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
+    UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_outer_core,
                                              mp->d_veloc_outer_core,
                                              mp->d_accel_outer_core,
                                              size,deltat,deltatsqover2,deltatover2);
@@ -280,7 +280,7 @@
     // debug
     DEBUG_BACKWARD_UPDATE();
 
-    UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
+    UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_outer_core,
                                              mp->d_b_veloc_outer_core,
                                              mp->d_b_accel_outer_core,
                                              size,deltat,deltatsqover2,deltatover2);
@@ -375,7 +375,7 @@
   threads = dim3(blocksize,1,1);
 
   if( *FORWARD_OR_ADJOINT == 1 ){
-    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+    multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
                                                            mp->d_veloc_crust_mantle,
                                                            mp->NGLOB_CRUST_MANTLE,
                                                            mp->two_omega_earth,
@@ -386,7 +386,7 @@
     // debug
     DEBUG_BACKWARD_UPDATE();
 
-    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+    multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
                                                            mp->d_b_veloc_crust_mantle,
                                                            mp->NGLOB_CRUST_MANTLE,
                                                            mp->b_two_omega_earth,
@@ -404,7 +404,7 @@
   threads = dim3(blocksize,1,1);
 
   if( *FORWARD_OR_ADJOINT == 1 ){
-    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_inner_core,
+    multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_inner_core,
                                                            mp->d_veloc_inner_core,
                                                            mp->NGLOB_INNER_CORE,
                                                            mp->two_omega_earth,
@@ -415,7 +415,7 @@
     // debug
     DEBUG_BACKWARD_UPDATE();
 
-    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_inner_core,
+    multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_inner_core,
                                                            mp->d_b_veloc_inner_core,
                                                            mp->NGLOB_INNER_CORE,
                                                            mp->b_two_omega_earth,
@@ -480,12 +480,12 @@
   threads = dim3(blocksize,1,1);
 
   if( *FORWARD_OR_ADJOINT == 1 ){
-    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+    update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_crust_mantle,
                                                          mp->d_accel_crust_mantle,
                                                          mp->NGLOB_CRUST_MANTLE,
                                                          deltatover2);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
-    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+    update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_crust_mantle,
                                                          mp->d_b_accel_crust_mantle,
                                                          mp->NGLOB_CRUST_MANTLE,
                                                          deltatover2);
@@ -500,12 +500,12 @@
   threads = dim3(blocksize,1,1);
 
   if( *FORWARD_OR_ADJOINT == 1 ){
-    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+    update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_inner_core,
                                                          mp->d_accel_inner_core,
                                                          mp->NGLOB_INNER_CORE,
                                                          deltatover2);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
-    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
+    update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_inner_core,
                                                          mp->d_b_accel_inner_core,
                                                          mp->NGLOB_INNER_CORE,
                                                          deltatover2);
@@ -562,14 +562,14 @@
 
   // multiplies accel with inverse of mass matrix
   if( *FORWARD_OR_ADJOINT == 1 ){
-    multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_accel_outer_core,
+    multiply_accel_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_outer_core,
                                                             mp->NGLOB_OUTER_CORE,
                                                             mp->d_rmass_outer_core);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
     // debug
     DEBUG_BACKWARD_UPDATE();
 
-    multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_accel_outer_core,
+    multiply_accel_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
                                                             mp->NGLOB_OUTER_CORE,
                                                             mp->d_b_rmass_outer_core);
   }
@@ -624,7 +624,7 @@
 
   // updates velocity
   if( *FORWARD_OR_ADJOINT == 1 ){
-    update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+    update_veloc_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_outer_core,
                                                           mp->d_accel_outer_core,
                                                           mp->NGLOB_OUTER_CORE,
                                                           deltatover2);
@@ -632,7 +632,7 @@
     // debug
     DEBUG_BACKWARD_UPDATE();
 
-    update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
+    update_veloc_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_outer_core,
                                                           mp->d_b_accel_outer_core,
                                                           mp->NGLOB_OUTER_CORE,
                                                           deltatover2);
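The launch-configuration changes above spell out the full four-argument CUDA execution configuration <<<grid, threads, shared_mem_bytes, stream>>>: 0 bytes of dynamic shared memory, and mp->compute_stream as the target stream. Placing all compute kernels on one dedicated stream keeps them ordered with respect to each other, while copies issued on the separate copy stream can overlap with them. A minimal sketch with an illustrative kernel that is not part of this patch:

// sketch: launching a kernel on a dedicated compute stream
__global__ void scale_kernel(float* x, int n, float a) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

void launch_on_stream(float* d_x, int n, cudaStream_t compute_stream) {
  dim3 threads(128, 1, 1);
  dim3 grid((n + threads.x - 1) / threads.x, 1, 1);
  // fourth argument selects the stream; the third (0) is the dynamic
  // shared-memory size in bytes, unused by this kernel
  scale_kernel<<<grid, threads, 0, compute_stream>>>(d_x, n, 2.0f);
}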

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu	2014-01-13 16:06:21 UTC (rev 22991)
@@ -47,53 +47,61 @@
                                                             int* ispec_selected_rec,
                                                             int* ibool,
                                                             realw* station_seismo_field,
-                                                            realw* desired_field,
+                                                            realw* d_field,
                                                             int nrec_local) {
 
 // vector fields
 
   int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+  int tx = threadIdx.x;
 
+  int irec,ispec,iglob;
+
   if(blockID < nrec_local) {
-    int irec = number_receiver_global[blockID]-1;
-    int ispec = ispec_selected_rec[irec]-1;
-    int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+    irec = number_receiver_global[blockID]-1;
+    ispec = ispec_selected_rec[irec]-1;
+    iglob = ibool[tx + NGLL3*ispec]-1;
 
-    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
-    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
-    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
+    station_seismo_field[3*NGLL3*blockID + 3*tx+0] = d_field[3*iglob];
+    station_seismo_field[3*NGLL3*blockID + 3*tx+1] = d_field[3*iglob+1];
+    station_seismo_field[3*NGLL3*blockID + 3*tx+2] = d_field[3*iglob+2];
   }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void write_seismograms_transfer_scalar_from_device_kernel(int* number_receiver_global,
+__global__ void write_seismograms_transfer_strain_from_device_kernel(int* number_receiver_global,
                                                                      int* ispec_selected_rec,
                                                                      int* ibool,
-                                                                     realw* station_seismo_field,
-                                                                     realw* desired_field,
+                                                                     realw* station_strain_field,
+                                                                     realw* d_field,
                                                                      int nrec_local) {
 
 // scalar fields
 
   int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+  int tx = threadIdx.x;
 
+  int irec,ispec,iglob;
+
   if(blockID < nrec_local) {
-    int irec = number_receiver_global[blockID]-1;
-    int ispec = ispec_selected_rec[irec]-1;
-    int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+    irec = number_receiver_global[blockID]-1;
+    ispec = ispec_selected_rec[irec]-1;
+    iglob = ibool[tx + NGLL3*ispec]-1;
 
-    station_seismo_field[NGLL3*blockID + threadIdx.x] = desired_field[iglob];
+    station_strain_field[NGLL3*blockID + tx] = d_field[iglob];
   }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
-void write_seismograms_transfer_from_device(Mesh* mp, realw* d_field,realw* h_field,
-                                          int* number_receiver_global,
-                                          int* d_ispec_selected,
-                                          int* h_ispec_selected,
-                                          int* ibool) {
+void write_seismograms_transfer_from_device(Mesh* mp,
+                                            realw* d_field,
+                                            realw* h_field,
+                                            int* number_receiver_global,
+                                            int* d_ispec_selected,
+                                            int* h_ispec_selected,
+                                            int* ibool) {
 
 TRACE("write_seismograms_transfer_from_device");
 
@@ -111,8 +119,13 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  // waits for previous copy call to be finished
+  if( GPU_ASYNC_COPY ){
+    cudaStreamSynchronize(mp->copy_stream);
+  }
+
   // prepare field transfer array on device
-  write_seismograms_transfer_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+  write_seismograms_transfer_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
                                                                   d_ispec_selected,
                                                                   mp->d_ibool_crust_mantle,
                                                                   mp->d_station_seismo_field,
@@ -120,18 +133,31 @@
                                                                   mp->nrec_local);
 
   // copies array to CPU
-  cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
-             3*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+  if( GPU_ASYNC_COPY ){
+    // waits for previous compute call to be finished
+    cudaStreamSynchronize(mp->compute_stream);
 
-  for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
-    irec = number_receiver_global[irec_local] - 1;
-    ispec = h_ispec_selected[irec] - 1;
+    // asynchronous copy
+    // note: the host array must then be updated by a subsequent call to the transfer_seismo_from_device_async() routine
+    cudaMemcpyAsync(mp->h_station_seismo_field,mp->d_station_seismo_field,
+                    3*NGLL3*(mp->nrec_local)*sizeof(realw),
+                    cudaMemcpyDeviceToHost,mp->copy_stream);
+  }else{
+    // synchronous copy
+    print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
+                                       3*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost),77000);
 
-    for(i = 0; i < NGLL3; i++) {
-      iglob = ibool[i+NGLL3*ispec] - 1;
-      h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
-      h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
-      h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+    // updates array on CPU
+    for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
+      irec = number_receiver_global[irec_local] - 1;
+      ispec = h_ispec_selected[irec] - 1;
+
+      for(i = 0; i < NGLL3; i++) {
+        iglob = ibool[i+NGLL3*ispec] - 1;
+        h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+        h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+        h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+      }
     }
   }
 
@@ -142,14 +168,15 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-void write_seismograms_transfer_scalar_from_device(Mesh* mp,
-                                                   realw* d_field,realw* h_field,
+void write_seismograms_transfer_strain_from_device(Mesh* mp,
+                                                   realw* d_field,
+                                                   realw* h_field,
                                                    int* number_receiver_global,
                                                    int* d_ispec_selected,
                                                    int* h_ispec_selected,
                                                    int* ibool) {
 
-  TRACE("write_seismograms_transfer_scalar_from_device");
+  TRACE("write_seismograms_transfer_strain_from_device");
 
   int irec_local,irec;
   int ispec,iglob,i;
@@ -166,28 +193,30 @@
   dim3 threads(blocksize,1,1);
 
   // prepare field transfer array on device
-  write_seismograms_transfer_scalar_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+  write_seismograms_transfer_strain_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
                                                                          d_ispec_selected,
                                                                          mp->d_ibool_crust_mantle,
-                                                                         mp->d_station_seismo_field,
+                                                                         mp->d_station_strain_field,
                                                                          d_field,
                                                                          mp->nrec_local);
 
   // copies array to CPU
-  cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
-             NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+  // synchronous copy
+  print_CUDA_error_if_any(cudaMemcpy(mp->h_station_strain_field,mp->d_station_strain_field,
+                                     NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost),77001);
 
+  // updates host array
   for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
     irec = number_receiver_global[irec_local] - 1;
     ispec = h_ispec_selected[irec] - 1;
     for(i = 0; i < NGLL3; i++) {
       iglob = ibool[i+NGLL3*ispec] - 1;
-      h_field[iglob] = mp->h_station_seismo_field[i+irec_local*NGLL3];
+      h_field[iglob] = mp->h_station_strain_field[i+irec_local*NGLL3];
     }
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("write_seismograms_transfer_scalar_from_device");
+  exit_on_cuda_error("write_seismograms_transfer_strain_from_device");
 #endif
 }
 
@@ -208,15 +237,17 @@
                                                int* ispec_selected_rec,
                                                int* ispec_selected_source,
                                                int* ibool) {
-TRACE("write_seismograms_transfer_cuda");
+  TRACE("write_seismograms_transfer_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
   // checks if anything to do
   if( mp->nrec_local == 0 ) return;
 
+  // transfers displacement values in receiver elements from GPU to CPU
   switch( mp->simulation_type ){
     case 1:
+      // forward simulation
       write_seismograms_transfer_from_device(mp,mp->d_displ_crust_mantle,
                                              displ,
                                              number_receiver_global,
@@ -225,50 +256,130 @@
       break;
 
     case 2:
+      // adjoint simulation
       write_seismograms_transfer_from_device(mp,mp->d_displ_crust_mantle,
                                              displ,
                                              number_receiver_global,
                                              mp->d_ispec_selected_source,
                                              ispec_selected_source, ibool);
 
-      write_seismograms_transfer_scalar_from_device(mp,mp->d_eps_trace_over_3_crust_mantle,
+      // strain
+      write_seismograms_transfer_strain_from_device(mp,mp->d_eps_trace_over_3_crust_mantle,
                                                     eps_trace_over_3,
                                                     number_receiver_global,
                                                     mp->d_ispec_selected_source,
                                                     ispec_selected_source, ibool);
-      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xx_crust_mantle,
+      write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_xx_crust_mantle,
                                                     epsilondev_xx,
                                                     number_receiver_global,
                                                     mp->d_ispec_selected_source,
                                                     ispec_selected_source, ibool);
-      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_yy_crust_mantle,
+      write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_yy_crust_mantle,
                                                     epsilondev_yy,
                                                     number_receiver_global,
                                                     mp->d_ispec_selected_source,
                                                     ispec_selected_source, ibool);
-      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xy_crust_mantle,
+      write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_xy_crust_mantle,
                                                     epsilondev_xy,
                                                     number_receiver_global,
                                                     mp->d_ispec_selected_source,
                                                     ispec_selected_source, ibool);
-      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xz_crust_mantle,
+      write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_xz_crust_mantle,
                                                     epsilondev_xz,
                                                     number_receiver_global,
                                                     mp->d_ispec_selected_source,
                                                     ispec_selected_source, ibool);
-      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_yz_crust_mantle,
+      write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_yz_crust_mantle,
                                                     epsilondev_yz,
                                                     number_receiver_global,
                                                     mp->d_ispec_selected_source,
                                                     ispec_selected_source, ibool);
+
       break;
 
     case 3:
-      write_seismograms_transfer_from_device(mp,mp->d_b_displ_crust_mantle,b_displ,
-                                                 number_receiver_global,
-                                                 mp->d_ispec_selected_rec,
-                                                 ispec_selected_rec, ibool);
+      // kernel simulation
+      write_seismograms_transfer_from_device(mp,mp->d_b_displ_crust_mantle,
+                                             b_displ,
+                                             number_receiver_global,
+                                             mp->d_ispec_selected_rec,
+                                             ispec_selected_rec, ibool);
       break;
   }
+
 }
 
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// data transfer to CPU host
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_seismo_from_device_async,
+              TRANSFER_SEISMO_FROM_DEVICE_ASYNC)(long* Mesh_pointer_f,
+                                                 realw* displ,
+                                                 realw* b_displ,
+                                                 int* number_receiver_global,
+                                                 int* ispec_selected_rec,
+                                                 int* ispec_selected_source,
+                                                 int* ibool) {
+
+  TRACE("transfer_seismo_from_device_async");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int irec,ispec,iglob,i;
+  realw* h_field;
+  int* h_ispec_selected;
+
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  // checks async-memcpy
+  if( GPU_ASYNC_COPY == 0 ){
+    exit_on_error("transfer_seismo_from_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+  }
+
+  // waits for previous copy call to be finished
+  cudaStreamSynchronize(mp->copy_stream);
+
+  // transfers displacements
+  // select target array on host
+  switch( mp->simulation_type ){
+    case 1:
+      // forward simulation
+      h_field = displ;
+      h_ispec_selected = ispec_selected_rec;
+      break;
+
+    case 2:
+      // adjoint simulation
+      h_field = displ;
+      h_ispec_selected = ispec_selected_source;
+      break;
+
+    case 3:
+      // kernel simulation
+      h_field = b_displ;
+      h_ispec_selected = ispec_selected_rec;
+      break;
+  }
+
+  // updates corresponding array on CPU
+  for(int irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
+    irec = number_receiver_global[irec_local] - 1;
+    ispec = h_ispec_selected[irec] - 1;
+
+    for(i = 0; i < NGLL3; i++) {
+      iglob = ibool[i+NGLL3*ispec] - 1;
+      h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+      h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+      h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+    }
+  }
+
+}
+
+
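
For illustration, the GPU_ASYNC_COPY branch above follows the standard two-stream CUDA pattern: synchronize the copy stream before reusing the host buffer, pack on the compute stream, synchronize it, then issue the device-to-host copy asynchronously. A minimal sketch with assumed names (gather_kernel, transfer_async, h_buf, d_buf, n); note that cudaMemcpyAsync only truly overlaps when h_buf is page-locked (cudaHostAlloc):

    #include <cuda_runtime.h>

    __global__ void gather_kernel(float *d_buf, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) d_buf[i] = 2.0f * i;   // stand-in for packing station values
    }

    void transfer_async(cudaStream_t compute_stream, cudaStream_t copy_stream,
                        float *d_buf, float *h_buf /* page-locked */, int n) {
      // make sure the previous asynchronous copy into h_buf has completed
      cudaStreamSynchronize(copy_stream);
      // pack values on the compute stream
      gather_kernel<<<(n + 127) / 128, 128, 0, compute_stream>>>(d_buf, n);
      // the copy must not start before the kernel has written d_buf
      cudaStreamSynchronize(compute_stream);
      // device-to-host copy overlaps with subsequent CPU/GPU work; a later
      // cudaStreamSynchronize(copy_stream) must guard any read of h_buf
      cudaMemcpyAsync(h_buf, d_buf, n * sizeof(float),
                      cudaMemcpyDeviceToHost, copy_stream);
    }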

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -67,10 +67,10 @@
   integer :: ier
 
   ! allocate crustal arrays
-  allocate( thlr(NKEYS_CRUST,NLAYERS_CRUST), &
-           velocp(NKEYS_CRUST,NLAYERS_CRUST), &
-           velocs(NKEYS_CRUST,NLAYERS_CRUST), &
-           dens(NKEYS_CRUST,NLAYERS_CRUST), &
+  allocate(thlr(NLAYERS_CRUST,NKEYS_CRUST), &
+           velocp(NLAYERS_CRUST,NKEYS_CRUST), &
+           velocs(NLAYERS_CRUST,NKEYS_CRUST), &
+           dens(NLAYERS_CRUST,NKEYS_CRUST), &
            stat=ier)
   if( ier /= 0 ) call exit_MPI(myrank,'error allocating crustal arrays')
 
@@ -184,7 +184,9 @@
 
  end subroutine model_crust
 
-!---------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
 
   subroutine read_crust_model()
 
@@ -226,12 +228,14 @@
 
   do ikey=1,NKEYS_CRUST
     read (1,"(a2)") code(ikey)
-    read (1,*) (velocp(ikey,i),i=1,NLAYERS_CRUST)
-    read (1,*) (velocs(ikey,i),i=1,NLAYERS_CRUST)
-    read (1,*) (dens(ikey,i),i=1,NLAYERS_CRUST)
-    read (1,*) (thlr(ikey,i),i=1,NLAYERS_CRUST-1),thlr(ikey,NLAYERS_CRUST)
-    if(thlr(ikey,NLAYERS_CRUST) > h_moho_max) h_moho_max = thlr(ikey,NLAYERS_CRUST)
-    if(thlr(ikey,NLAYERS_CRUST) < h_moho_min) h_moho_min = thlr(ikey,NLAYERS_CRUST)
+    read (1,*) (velocp(i,ikey),i=1,NLAYERS_CRUST)
+    read (1,*) (velocs(i,ikey),i=1,NLAYERS_CRUST)
+    read (1,*) (dens(i,ikey),i=1,NLAYERS_CRUST)
+    read (1,*) (thlr(i,ikey),i=1,NLAYERS_CRUST-1),thlr(NLAYERS_CRUST,ikey)
+
+    ! stores min/max moho thickness
+    if(thlr(NLAYERS_CRUST,ikey) > h_moho_max) h_moho_max = thlr(NLAYERS_CRUST,ikey)
+    if(thlr(NLAYERS_CRUST,ikey) < h_moho_min) h_moho_min = thlr(NLAYERS_CRUST,ikey)
   enddo
   close(1)
 
@@ -240,7 +244,9 @@
 
   end subroutine read_crust_model
 
-!---------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
 
   subroutine crust_CAPsmoothed(lat,lon,velp,vels,rho,thick,abbreviation,&
                               code,thlr,velocp,velocs,dens)
@@ -263,7 +269,7 @@
   ! argument variables
   double precision :: lat,lon
   double precision,dimension(NLAYERS_CRUST) :: rho,thick,velp,vels
-  double precision,dimension(NKEYS_CRUST,NLAYERS_CRUST) :: thlr,velocp,velocs,dens
+  double precision,dimension(NLAYERS_CRUST,NKEYS_CRUST) :: thlr,velocp,velocs,dens
 
   character(len=2) :: code(NKEYS_CRUST)
   character(len=2) :: abbreviation(NCAP_CRUST/2,NCAP_CRUST)
@@ -280,13 +286,15 @@
   !-------------------------------
 
   ! local variables
-  double precision xlon(NTHETA*NPHI),xlat(NTHETA*NPHI),weight(NTHETA*NPHI)
-  double precision rhol(NLAYERS_CRUST),thickl(NLAYERS_CRUST),velpl(NLAYERS_CRUST),velsl(NLAYERS_CRUST)
-  double precision weightl,cap_degree,dist
-  double precision h_sed
-  integer i,icolat,ilon,ierr
-  character(len=2) crustaltype
+  double precision :: xlon(NTHETA*NPHI),xlat(NTHETA*NPHI),weight(NTHETA*NPHI)
+  double precision :: rhol(NLAYERS_CRUST),thickl(NLAYERS_CRUST),velpl(NLAYERS_CRUST),velsl(NLAYERS_CRUST)
 
+  double precision :: weightl,cap_degree,dist
+  double precision :: h_sed
+  integer :: i,icolat,ilon
+  character(len=2) :: crustaltype
+
+
   ! checks latitude/longitude
   if(lat > 90.0d0 .or. lat < -90.0d0 .or. lon > 180.0d0 .or. lon < -180.0d0) &
     stop 'error in latitude/longitude range in crust'
@@ -328,10 +336,11 @@
   do i=1,NTHETA*NPHI
     ! gets crust values
     call icolat_ilon(xlat(i),xlon(i),icolat,ilon)
+
     crustaltype = abbreviation(icolat,ilon)
+
     call get_crust_structure(crustaltype,velpl,velsl,rhol,thickl, &
-                            code,thlr,velocp,velocs,dens,ierr)
-    if(ierr /= 0) stop 'error in routine get_crust_structure'
+                             code,thlr,velocp,velocs,dens)
 
     ! sediment thickness
     h_sed = thickl(3) + thickl(4)
@@ -359,7 +368,9 @@
   end subroutine crust_CAPsmoothed
 
 
-!------------------------------------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
 
   subroutine icolat_ilon(xlat,xlon,icolat,ilon)
 
@@ -372,57 +383,65 @@
 
   if(xlat > 90.0d0 .or. xlat < -90.0d0 .or. xlon > 180.0d0 .or. xlon < -180.0d0) &
     stop 'error in latitude/longitude range in icolat_ilon'
-  icolat=int(1+( (90.d0-xlat)*0.5d0 ))
-  if(icolat == 91) icolat=90
-  ilon=int(1+( (180.d0+xlon)*0.5d0 ))
-  if(ilon == 181) ilon=1
 
+  icolat = int(1+( (90.d0-xlat)*0.5d0 ))
+  if(icolat == 91) icolat = 90
+
+  ilon = int(1+( (180.d0+xlon)*0.5d0 ))
+  if(ilon == 181) ilon = 1
+
   if(icolat>90 .or. icolat<1) stop 'error in routine icolat_ilon'
   if(ilon<1 .or. ilon>180) stop 'error in routine icolat_ilon'
 
   end subroutine icolat_ilon
 
-!---------------------------------------------------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  subroutine get_crust_structure(type,vptyp,vstyp,rhtyp,thtp, &
-               code,thlr,velocp,velocs,dens,ierr)
+  subroutine get_crust_structure(crustaltype,vptyp,vstyp,rhtyp,thtp, &
+               code,thlr,velocp,velocs,dens)
 
   use model_crust_par,only: NLAYERS_CRUST,NKEYS_CRUST
 
   implicit none
 
   ! argument variables
-  integer ierr
-  double precision rhtyp(NLAYERS_CRUST),thtp(NLAYERS_CRUST)
-  double precision vptyp(NLAYERS_CRUST),vstyp(NLAYERS_CRUST)
-  character(len=2) type,code(NKEYS_CRUST)
-  double precision thlr(NKEYS_CRUST,NLAYERS_CRUST),velocp(NKEYS_CRUST,NLAYERS_CRUST)
-  double precision velocs(NKEYS_CRUST,NLAYERS_CRUST),dens(NKEYS_CRUST,NLAYERS_CRUST)
+  character(len=2) :: crustaltype
+  double precision :: rhtyp(NLAYERS_CRUST),thtp(NLAYERS_CRUST)
+  double precision :: vptyp(NLAYERS_CRUST),vstyp(NLAYERS_CRUST)
+  character(len=2) :: code(NKEYS_CRUST)
+  double precision :: thlr(NLAYERS_CRUST,NKEYS_CRUST),velocp(NLAYERS_CRUST,NKEYS_CRUST)
+  double precision :: velocs(NLAYERS_CRUST,NKEYS_CRUST),dens(NLAYERS_CRUST,NKEYS_CRUST)
 
   ! local variables
-  integer i,ikey
+  integer :: ikey,ikey_selected
 
-  ierr=1
+  ! determines the key index for the given crustal type
+  ikey_selected = 0
   do ikey=1,NKEYS_CRUST
-    if (code(ikey) == type) then
-      do i=1,NLAYERS_CRUST
-        vptyp(i)=velocp(ikey,i)
-        vstyp(i)=velocs(ikey,i)
-        rhtyp(i)=dens(ikey,i)
-      enddo
-      do i=1,NLAYERS_CRUST-1
-        thtp(i)=thlr(ikey,i)
-      enddo
-      !   get distance to Moho from the bottom of the ocean or the ice
-      thtp(NLAYERS_CRUST)=thlr(ikey,NLAYERS_CRUST)-thtp(1)-thtp(2)
-      ierr=0
+    if (code(ikey) == crustaltype) then
+      ikey_selected = ikey
+      exit
     endif
   enddo
+  if( ikey_selected == 0 ) stop 'error in get_crust_structure: could not find ikey for selected crustal type'
 
+  ! sets vp,vs and rho for all layers
+  vptyp(:) = velocp(:,ikey_selected)
+  vstyp(:) = velocs(:,ikey_selected)
+  rhtyp(:) = dens(:,ikey_selected)
+  thtp(:) = thlr(:,ikey_selected)
+
+  !   get distance to Moho from the bottom of the ocean or the ice
+  thtp(NLAYERS_CRUST) = thtp(NLAYERS_CRUST) - thtp(1) - thtp(2)
+
   end subroutine get_crust_structure
 
 
-!---------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
 
   subroutine CAP_vardegree(lon,lat,xlon,xlat,weight,CAP_DEGREE,NTHETA,NPHI)
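
For illustration, the index reordering above puts the layer index first; Fortran arrays are column-major, so thlr(:,ikey) then reads one contiguous block of memory per crustal type. A hypothetical row-major C/CUDA host-side analogue (array sizes are placeholders, not the CRUST2.0 dimensions):

    /* row-major C analogue of the Fortran column-major layout:
       thlr(NLAYERS_CRUST,NKEYS_CRUST) corresponds to thlr[key][layer],
       i.e. the layer index varies fastest in memory */
    enum { NLAYERS = 8, NKEYS = 360 };   /* illustrative sizes */
    static double thlr[NKEYS][NLAYERS];

    /* copying one crustal type now touches a contiguous run of memory,
       matching the Fortran access thlr(:,ikey_selected) */
    void copy_type(double thtp[NLAYERS], int ikey_selected) {
      for (int i = 0; i < NLAYERS; i++)
        thtp[i] = thlr[ikey_selected][i];
    }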
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk	2014-01-13 16:06:21 UTC (rev 22991)
@@ -269,9 +269,9 @@
 $O/%.check_adios_module.o: $S/%.F90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o
 	${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/%.check_adios.o: $S/%.f90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o
+$O/%.check_adios.o: $S/%.f90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o $O/write_AVS_DX_surface_data_adios.check_adios_module.o
 	${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/%.check_adios.o: $S/%.F90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o
+$O/%.check_adios.o: $S/%.F90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o $O/write_AVS_DX_surface_data_adios.check_adios_module.o
 	${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -193,7 +193,7 @@
 
   ! type: hexahedrons
   write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
-  write(IOVTK,*) (12,ispec=1,nspec)
+  write(IOVTK,'(6i12)') (12,ispec=1,nspec)
   write(IOVTK,*) ""
 
   write(IOVTK,'(a,i12)') "CELL_DATA ",nspec
@@ -268,7 +268,7 @@
 
   ! type: hexahedrons
   write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
-  write(IOVTK,*) (12,ispec=1,nspec)
+  write(IOVTK,'(6i12)') (12,ispec=1,nspec)
   write(IOVTK,*) ""
 
   write(IOVTK,'(a,i12)') "CELL_DATA ",nspec
@@ -371,7 +371,7 @@
 
   ! type: hexahedrons
   write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
-  write(IOVTK,*) (12,ispec=1,nspec)
+  write(IOVTK,'(6i12)') (12,ispec=1,nspec)
   write(IOVTK,*) ""
 
   ! x components
@@ -561,7 +561,7 @@
 
     ! type: hexahedrons
     write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec*NPROCTOT
-    write(IOVTK,*) (12,ispec=1,nspec*NPROCTOT)
+    write(IOVTK,'(6i12)') (12,ispec=1,nspec*NPROCTOT)
     write(IOVTK,*) ""
 
     ! x components
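
For illustration, the list-directed write(IOVTK,*) left the integer layout up to the compiler, which some VTK readers reject; the explicit '(6i12)' format pins it to six 12-character integers per line. A hypothetical C/CUDA host-side sketch of the same layout (fp and nspec are assumed; 12 is the VTK hexahedron cell type):

    #include <stdio.h>

    /* writes the CELL_TYPES block with a fixed six-values-per-line layout */
    void write_cell_types(FILE *fp, int nspec) {
      fprintf(fp, "CELL_TYPES %12d\n", nspec);
      for (int ispec = 0; ispec < nspec; ispec++) {
        fprintf(fp, "%12d", 12);                /* VTK_HEXAHEDRON */
        if ((ispec + 1) % 6 == 0) fprintf(fp, "\n");
      }
      if (nspec % 6 != 0) fprintf(fp, "\n");
    }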

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -238,17 +238,20 @@
 
   end subroutine assemble_MPI_scalar_w
 
+
+!-------------------------------------------------------------------------------------------------
 !
+! CUDA routines
+!
 !-------------------------------------------------------------------------------------------------
-!
 
+
   subroutine assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROC, &
                                           buffer_send_scalar,buffer_recv_scalar, &
                                           num_interfaces,max_nibool_interfaces, &
                                           nibool_interfaces, &
                                           my_neighbours, &
-                                          request_send_scalar,request_recv_scalar, &
-                                          FORWARD_OR_ADJOINT)
+                                          request_send_scalar,request_recv_scalar)
 
 ! non-blocking MPI send
 
@@ -270,20 +273,15 @@
   integer, dimension(num_interfaces) :: nibool_interfaces,my_neighbours
   integer, dimension(num_interfaces) :: request_send_scalar,request_recv_scalar
 
-  integer :: FORWARD_OR_ADJOINT
-
   ! local parameters
   integer iinterface
 
-! sends only if more than one partition
+  ! note: preparation of the contribution between partitions using MPI
+  !       transferring the MPI buffers to the CPU is already done in the transfer_boun_pot_from_device() call
+
+  ! sends only if more than one partition
   if(NPROC > 1) then
 
-    ! preparation of the contribution between partitions using MPI
-    ! transfers mpi buffers to CPU
-    call transfer_boun_pot_from_device(Mesh_pointer, &
-                                      buffer_send_scalar, &
-                                      FORWARD_OR_ADJOINT)
-
     ! send messages
     do iinterface = 1, num_interfaces
       ! non-blocking synchronous send request
@@ -323,8 +321,7 @@
 
   integer :: num_interfaces,max_nibool_interfaces
 
-  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces,num_interfaces) :: &
-       buffer_recv_scalar
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces,num_interfaces) :: buffer_recv_scalar
   integer, dimension(num_interfaces) :: request_send_scalar,request_recv_scalar
 
   integer :: FORWARD_OR_ADJOINT
@@ -360,3 +357,64 @@
   endif
 
   end subroutine assemble_MPI_scalar_write_cuda
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! with cuda functions...
+
+  subroutine transfer_boundarypot_to_device(Mesh_pointer, NPROC, &
+                                            buffer_recv_scalar, &
+                                            num_interfaces,max_nibool_interfaces,&
+                                            request_recv_scalar, &
+                                            IREGION,FORWARD_OR_ADJOINT)
+
+  use constants
+
+  implicit none
+
+  integer(kind=8) :: Mesh_pointer
+
+  integer :: NPROC
+
+  ! array to assemble
+  integer :: num_interfaces,max_nibool_interfaces
+
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces,num_interfaces) :: buffer_recv_scalar
+
+  integer, dimension(num_interfaces) :: request_recv_scalar
+
+  integer :: IREGION
+  integer :: FORWARD_OR_ADJOINT
+
+  ! local parameters
+  integer :: iinterface
+
+  ! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! waits for communications completion (recv)
+    do iinterface = 1, num_interfaces
+      call wait_req(request_recv_scalar(iinterface))
+    enddo
+
+    ! sends contributions to GPU
+    call transfer_buffer_to_device_async(Mesh_pointer, &
+                                         buffer_recv_scalar, &
+                                         IREGION,FORWARD_OR_ADJOINT)
+  endif
+
+  ! This step is done via previous function transfer_and_assemble...
+  ! do iinterface = 1, num_interfaces
+  !   do ipoin = 1, nibool_interfaces(iinterface)
+  !     array_val(ibool_interfaces(ipoin,iinterface)) = &
+  !          array_val(ibool_interfaces(ipoin,iinterface)) + buffer_recv_scalar(ipoin,iinterface)
+  !   enddo
+  ! enddo
+
+  end subroutine transfer_boundarypot_to_device
+
+
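
For illustration, transfer_boundarypot_to_device waits on the outstanding receives and then hands the buffer to the GPU without blocking the compute stream. A minimal sketch of what such a step can look like on the C side (all names assumed; the actual transfer_buffer_to_device_async lives in the CUDA sources):

    #include <mpi.h>
    #include <cuda_runtime.h>

    /* wait for all non-blocking receives, then push the received boundary
       buffer to the device on the copy stream, so the host-to-device copy
       overlaps with the inner-element kernels on the compute stream */
    void boundary_to_device_async(float *d_buf, const float *h_buf, size_t nbytes,
                                  int num_interfaces, MPI_Request *request_recv,
                                  cudaStream_t copy_stream) {
      MPI_Waitall(num_interfaces, request_recv, MPI_STATUSES_IGNORE);
      cudaMemcpyAsync(d_buf, h_buf, nbytes, cudaMemcpyHostToDevice, copy_stream);
    }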

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -98,7 +98,72 @@
 !-------------------------------------------------------------------------------------------------
 !
 
+  subroutine assemble_MPI_vector_w(NPROC,nglob, &
+                                           array_val, &
+                                           buffer_recv_vector, &
+                                           num_interfaces,max_nibool_interfaces, &
+                                           nibool_interfaces,ibool_interfaces, &
+                                           request_send_vector,request_recv_vector)
 
+! waits for data to receive and assembles
+
+  use constants
+
+  implicit none
+
+  integer :: NPROC
+  integer :: nglob
+
+  ! array to assemble
+  real(kind=CUSTOM_REAL), dimension(NDIM,nglob) :: array_val
+
+  integer :: num_interfaces,max_nibool_interfaces
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
+       buffer_recv_vector
+
+  integer, dimension(num_interfaces) :: nibool_interfaces
+  integer, dimension(max_nibool_interfaces,num_interfaces) :: ibool_interfaces
+  integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
+
+  integer ipoin,iinterface
+
+! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces
+      call wait_req(request_recv_vector(iinterface))
+    enddo
+
+    ! adding contributions of neighbours
+    do iinterface = 1, num_interfaces
+      do ipoin = 1, nibool_interfaces(iinterface)
+        array_val(:,ibool_interfaces(ipoin,iinterface)) = &
+             array_val(:,ibool_interfaces(ipoin,iinterface)) &
+             + buffer_recv_vector(:,ipoin,iinterface)
+      enddo
+    enddo
+
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces
+      call wait_req(request_send_vector(iinterface))
+    enddo
+
+  endif
+
+  end subroutine assemble_MPI_vector_w
+
+
+!-------------------------------------------------------------------------------------------------
+!
+! CUDA routines
+!
+!-------------------------------------------------------------------------------------------------
+
+
 ! interrupt might improve MPI performance
 ! see: https://computing.llnl.gov/tutorials/mpi_performance/#Sender-ReceiverSync
 !
@@ -110,8 +175,7 @@
                                           num_interfaces,max_nibool_interfaces, &
                                           nibool_interfaces, &
                                           my_neighbours, &
-                                          request_send_vector,request_recv_vector,&
-                                          IREGION,FORWARD_OR_ADJOINT)
+                                          request_send_vector,request_recv_vector)
 
   ! sends data
   ! note: array to assemble already filled into buffer_send_vector array
@@ -131,21 +195,15 @@
   integer, dimension(num_interfaces) :: nibool_interfaces,my_neighbours
   integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
 
-  integer :: IREGION
-  integer :: FORWARD_OR_ADJOINT
-
   ! local parameters
   integer :: iinterface
 
+  ! note: preparation of the contribution between partitions using MPI
+  !       already done in transfer_boun_from_device() routines
+
   ! send only if more than one partition
   if(NPROC > 1) then
 
-    ! preparation of the contribution between partitions using MPI
-    ! transfers mpi buffers to CPU
-    call transfer_boun_accel_from_device(Mesh_pointer, &
-                                         buffer_send_vector,&
-                                         IREGION,FORWARD_OR_ADJOINT)
-
     ! send messages
     do iinterface = 1, num_interfaces
       call isend_cr(buffer_send_vector(1,1,iinterface), &
@@ -169,12 +227,11 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine assemble_MPI_vector_w(NPROC,nglob, &
-                                           array_val, &
-                                           buffer_recv_vector, &
-                                           num_interfaces,max_nibool_interfaces, &
-                                           nibool_interfaces,ibool_interfaces, &
-                                           request_send_vector,request_recv_vector)
+  subroutine assemble_MPI_vector_write_cuda(Mesh_pointer,NPROC, &
+                                            buffer_recv_vector, &
+                                            num_interfaces,max_nibool_interfaces, &
+                                            request_send_vector,request_recv_vector, &
+                                            IREGION,FORWARD_OR_ADJOINT )
 
 ! waits for data to receive and assembles
 
@@ -182,23 +239,22 @@
 
   implicit none
 
+  integer(kind=8) :: Mesh_pointer
+
   integer :: NPROC
-  integer :: nglob
 
-  ! array to assemble
-  real(kind=CUSTOM_REAL), dimension(NDIM,nglob) :: array_val
-
   integer :: num_interfaces,max_nibool_interfaces
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
-       buffer_recv_vector
-
-  integer, dimension(num_interfaces) :: nibool_interfaces
-  integer, dimension(max_nibool_interfaces,num_interfaces) :: ibool_interfaces
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: buffer_recv_vector
   integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
 
-  integer ipoin,iinterface
+  integer :: IREGION
+  integer :: FORWARD_OR_ADJOINT
 
+  ! local parameters
+
+  integer :: iinterface
+
 ! here we have to assemble all the contributions between partitions using MPI
 
   ! assemble only if more than one partition
@@ -210,14 +266,18 @@
     enddo
 
     ! adding contributions of neighbours
-    do iinterface = 1, num_interfaces
-      do ipoin = 1, nibool_interfaces(iinterface)
-        array_val(:,ibool_interfaces(ipoin,iinterface)) = &
-             array_val(:,ibool_interfaces(ipoin,iinterface)) &
-             + buffer_recv_vector(:,ipoin,iinterface)
-      enddo
-    enddo
+    call transfer_asmbl_accel_to_device(Mesh_pointer, &
+                                        buffer_recv_vector, &
+                                        IREGION,FORWARD_OR_ADJOINT)
 
+    ! This step is done via previous function transfer_and_assemble...
+    ! do iinterface = 1, num_interfaces
+    !   do ipoin = 1, nibool_interfaces(iinterface)
+    !     array_val(:,ibool_interfaces(ipoin,iinterface)) = &
+    !          array_val(:,ibool_interfaces(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
+    !   enddo
+    ! enddo
+
     ! wait for communications completion (send)
     do iinterface = 1, num_interfaces
       call wait_req(request_send_vector(iinterface))
@@ -225,21 +285,21 @@
 
   endif
 
-  end subroutine assemble_MPI_vector_w
+  end subroutine assemble_MPI_vector_write_cuda
 
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine assemble_MPI_vector_write_cuda(Mesh_pointer,NPROC, &
+! with cuda functions...
+
+  subroutine transfer_boundary_to_device(Mesh_pointer, NPROC, &
                                             buffer_recv_vector, &
-                                            num_interfaces,max_nibool_interfaces, &
-                                            request_send_vector,request_recv_vector, &
-                                            IREGION,FORWARD_OR_ADJOINT )
+                                            num_interfaces,max_nibool_interfaces,&
+                                            request_recv_vector, &
+                                            IREGION,FORWARD_OR_ADJOINT)
 
-! waits for data to receive and assembles
-
   use constants
 
   implicit none
@@ -248,47 +308,43 @@
 
   integer :: NPROC
 
+  ! array to assemble
   integer :: num_interfaces,max_nibool_interfaces
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: buffer_recv_vector
-  integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
+       buffer_recv_vector
 
+  integer, dimension(num_interfaces) :: request_recv_vector
+
   integer :: IREGION
   integer :: FORWARD_OR_ADJOINT
 
   ! local parameters
-
   integer :: iinterface
 
-! here we have to assemble all the contributions between partitions using MPI
+  ! here we have to assemble all the contributions between partitions using MPI
 
   ! assemble only if more than one partition
   if(NPROC > 1) then
 
-    ! wait for communications completion (recv)
+    ! waits for communications completion (recv)
     do iinterface = 1, num_interfaces
       call wait_req(request_recv_vector(iinterface))
     enddo
 
-    ! adding contributions of neighbours
-    call transfer_asmbl_accel_to_device(Mesh_pointer, &
-                                        buffer_recv_vector, &
-                                        IREGION,FORWARD_OR_ADJOINT)
+    ! sends contributions to GPU
+    call transfer_buffer_to_device_async(Mesh_pointer, &
+                                         buffer_recv_vector, &
+                                         IREGION,FORWARD_OR_ADJOINT)
+  endif
 
-    ! This step is done via previous function transfer_and_assemble...
-    ! do iinterface = 1, num_interfaces
-    !   do ipoin = 1, nibool_interfaces(iinterface)
-    !     array_val(:,ibool_interfaces(ipoin,iinterface)) = &
-    !          array_val(:,ibool_interfaces(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
-    !   enddo
-    ! enddo
+  ! This step is done via previous function transfer_and_assemble...
+  ! do iinterface = 1, num_interfaces
+  !   do ipoin = 1, nibool_interfaces(iinterface)
+  !     array_val(:,ibool_interfaces(ipoin,iinterface)) = &
+  !          array_val(:,ibool_interfaces(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
+  !   enddo
+  ! enddo
 
-    ! wait for communications completion (send)
-    do iinterface = 1, num_interfaces
-      call wait_req(request_send_vector(iinterface))
-    enddo
+  end subroutine transfer_boundary_to_device
 
-  endif
-
-  end subroutine assemble_MPI_vector_write_cuda
-
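
For illustration, on the send side the boundary values are now assumed to already sit in the host buffer; with GPU_ASYNC_COPY the caller first waits on the copy stream (sync_copy_from_device) before posting the non-blocking sends. A minimal sketch with assumed names, mirroring the NDIM=3 vector buffer layout:

    #include <mpi.h>
    #include <cuda_runtime.h>

    /* post the non-blocking sends once the device-to-host copy of the
       boundary buffer has completed on the copy stream */
    void send_boundary(float *h_send, int num_interfaces, const int *nibool,
                       int max_nibool, const int *neighbours,
                       MPI_Request *request_send, cudaStream_t copy_stream) {
      cudaStreamSynchronize(copy_stream);   /* D2H memcpy finished */
      for (int i = 0; i < num_interfaces; i++)
        MPI_Isend(&h_send[3 * max_nibool * i], 3 * nibool[i], MPI_FLOAT,
                  neighbours[i], 0 /* tag */, MPI_COMM_WORLD, &request_send[i]);
    }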

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -151,11 +151,14 @@
   implicit none
 
   ! local parameters
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: adj_sourcearray
-  integer :: irec,irec_local,i,j,k,iglob,it_sub_adj,itime
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: tmp_sourcearray
+  integer :: irec,irec_local,i,j,k,iglob,it_sub_adj,itime,ier
+  integer :: ivec_index
   character(len=150) :: adj_source_file
   logical :: ibool_read_adj_arrays
 
+  ! note: we check if nadj_rec_local > 0 before calling this routine
+
   ! figure out if we need to read in a chunk of the adjoint source at this timestep
   it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
   ibool_read_adj_arrays = (((it == it_begin) .or. (mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) &
@@ -163,12 +166,13 @@
 
   ! needs to read in a new chunk/block of the adjoint source
   if(ibool_read_adj_arrays) then
+    ! allocates temporary source array
+    allocate(tmp_sourcearray(NDIM,NGLLX,NGLLY,NGLLZ,NTSTEP_BETWEEN_READ_ADJSRC),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating array tmp_sourcearray')
 
-    ! temporary source array
-    allocate(adj_sourcearray(NDIM,NGLLX,NGLLY,NGLLZ,NTSTEP_BETWEEN_READ_ADJSRC))
-    adj_sourcearray = 0._CUSTOM_REAL
+    tmp_sourcearray(:,:,:,:,:) = 0._CUSTOM_REAL
+    irec_local = 0
 
-    irec_local = 0
     do irec = 1, nrec
       ! check that the source slice number is okay
       if(islice_selected_rec(irec) < 0 .or. islice_selected_rec(irec) > NPROCTOT_VAL-1) then
@@ -184,23 +188,25 @@
         adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
         call compute_arrays_source_adjoint(myrank,adj_source_file, &
                   xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
-                  nu(:,:,irec),adj_sourcearray, xigll,yigll,zigll,iadjsrc_len(it_sub_adj), &
+                  nu(:,:,irec),tmp_sourcearray, xigll,yigll,zigll,iadjsrc_len(it_sub_adj), &
                   iadjsrc,it_sub_adj,NSTEP_SUB_ADJ,NTSTEP_BETWEEN_READ_ADJSRC,DT)
 
         ! stores source array
         ! note: the adj_sourcearrays has a time stepping from 1 to NTSTEP_BETWEEN_READ_ADJSRC
         !          this gets overwritten every time a new block/chunk is read in
         do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
-          adj_sourcearrays(:,:,:,:,irec_local,itime) = adj_sourcearray(:,:,:,:,itime)
+          adj_sourcearrays(:,:,:,:,irec_local,itime) = tmp_sourcearray(:,:,:,:,itime)
         enddo
 
       endif
     enddo
+
+    ! checks that number of read sources is valid
     if(irec_local /= nadj_rec_local) &
       call exit_MPI(myrank,'irec_local /= nadj_rec_local in adjoint simulation')
 
-    deallocate(adj_sourcearray)
-
+    ! frees temporary array
+    deallocate(tmp_sourcearray)
   endif
 
   ! adds adjoint sources
@@ -213,6 +219,9 @@
       if(myrank == islice_selected_rec(irec)) then
         irec_local = irec_local + 1
 
+        ! adjoint source array index
+        ivec_index = iadj_vec(it)
+
         ! adds source contributions
         do k=1,NGLLZ
           do j=1,NGLLY
@@ -277,7 +286,7 @@
               !           assuming that until that end the backward/reconstructed wavefield and adjoint fields
               !           have a zero contribution to adjoint kernels.
               accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) &
-                            + adj_sourcearrays(:,i,j,k,irec_local,iadj_vec(it))
+                            + adj_sourcearrays(:,i,j,k,irec_local,ivec_index)
 
             enddo
           enddo
@@ -287,10 +296,54 @@
     enddo
 
   else
+
     ! on GPU
-    call compute_add_sources_adjoint_cuda(Mesh_pointer,nrec,adj_sourcearrays, &
-                                          islice_selected_rec,ispec_selected_rec, &
-                                          iadj_vec(it))
+    ! note: adjoint source arrays can become very large when many receiver stations are used;
+    !       we therefore overlap the memory transfer to the GPU here
+
+    ! current time index
+    ivec_index = iadj_vec(it)
+
+    if( GPU_ASYNC_COPY ) then
+      ! only transfers the array synchronously at the beginning or whenever new arrays were read in
+      if( ibool_read_adj_arrays ) then
+        ! transfers adjoint arrays to GPU device memory
+        ! note: the function call passes a pointer into adj_sourcearrays at the corresponding time slice
+        call transfer_adj_to_device(Mesh_pointer,nrec,adj_sourcearrays(1,1,1,1,1,ivec_index), &
+                                    islice_selected_rec)
+      endif
+    else
+      ! synchronously transfers adjoint arrays to GPU device memory before adding adjoint sources on the GPU
+      call transfer_adj_to_device(Mesh_pointer,nrec,adj_sourcearrays(1,1,1,1,1,ivec_index), &
+                                  islice_selected_rec)
+    endif
+
+    ! adds adjoint source contributions
+    call compute_add_sources_adjoint_cuda(Mesh_pointer,nrec)
+
+    if( GPU_ASYNC_COPY ) then
+      ! starts asynchronous transfer of the next adjoint arrays to GPU device memory
+      ! (making sure the next adj_sourcearrays values were already read in)
+      if( (.not. ibool_read_adj_arrays) .and. &
+          (.not. mod(it,NTSTEP_BETWEEN_READ_ADJSRC) == 0) .and. &
+          (.not. it == it_end) ) then
+        ! next time index
+        ivec_index = iadj_vec(it+1)
+
+        ! checks next index
+        if( ivec_index < 1 .or. ivec_index > NTSTEP_BETWEEN_READ_ADJSRC ) then
+          print*,'error iadj_vec bounds: rank',myrank,' it = ',it,' index = ',ivec_index, &
+                 'out of bounds ',1,'to',NTSTEP_BETWEEN_READ_ADJSRC
+          call exit_MPI(myrank,'error iadj_vec index bounds')
+        endif
+
+        ! asynchronously transfers the next time slice
+        call transfer_adj_to_device_async(Mesh_pointer,nrec,adj_sourcearrays(1,1,1,1,1,ivec_index), &
+                                          islice_selected_rec)
+      endif
+    endif
+
+
   endif
 
   end subroutine compute_add_sources_adjoint
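
For illustration, the GPU_ASYNC_COPY logic above amounts to double-buffering the adjoint source time slices: copy synchronously only when a fresh chunk was read from disk, and otherwise rely on the slice for step it having been prefetched at step it-1. A minimal sketch with assumed names (pinned host buffers are assumed so the copies are truly asynchronous):

    #include <cuda_runtime.h>

    /* stand-in kernel: adds the current adjoint source slice to the acceleration */
    __global__ void add_adj(float *accel, const float *adj, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) accel[i] += adj[i];
    }

    /* double-buffered adjoint slices on the device (ping-pong): the kernel
       reads d_adj[cur] while the slice for it+1 lands in d_adj[1 - cur] */
    void adjoint_step(int it, int it_end, int nt_between_reads, int chunk_was_read,
                      const float *h_slice_cur, const float *h_slice_next,
                      float *d_adj[2], int cur, float *d_accel, int n,
                      cudaStream_t compute_stream, cudaStream_t copy_stream) {
      size_t slice_bytes = (size_t)n * sizeof(float);
      if (chunk_was_read)  /* fresh chunk: current slice must precede the kernel */
        cudaMemcpyAsync(d_adj[cur], h_slice_cur, slice_bytes,
                        cudaMemcpyHostToDevice, compute_stream);
      add_adj<<<(n + 127) / 128, 128, 0, compute_stream>>>(d_accel, d_adj[cur], n);
      /* prefetch the next slice, except right before a new chunk is read from
         disk (mirrors the mod(it,NTSTEP_BETWEEN_READ_ADJSRC) test above) */
      if (!chunk_was_read && it % nt_between_reads != 0 && it != it_end)
        cudaMemcpyAsync(d_adj[1 - cur], h_slice_next, slice_bytes,
                        cudaMemcpyHostToDevice, copy_stream);
    }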

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -104,6 +104,20 @@
       ! on GPU
       ! includes FORWARD_OR_ADJOINT == 1
       call compute_forces_outer_core_cuda(Mesh_pointer,iphase,time,1)
+
+      ! initiates asynchronous MPI transfer
+      if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+        ! outer core region
+        ! wait for asynchronous copy to finish
+        call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_scalar_outer_core,IREGION_OUTER_CORE,1)
+        ! sends mpi buffers
+        call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                                           buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
+                                           num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                                           nibool_interfaces_outer_core,&
+                                           my_neighbours_outer_core, &
+                                           request_send_scalar_oc,request_recv_scalar_oc)
+      endif
     endif
 
 
@@ -172,14 +186,25 @@
                                 request_send_scalar_oc,request_recv_scalar_oc)
       else
         ! on GPU
-        ! outer core
-        call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                                           buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
-                                           num_interfaces_outer_core,max_nibool_interfaces_oc, &
-                                           nibool_interfaces_outer_core,&
-                                           my_neighbours_outer_core, &
-                                           request_send_scalar_oc,request_recv_scalar_oc, &
-                                           1) ! <-- 1 == fwd accel
+        ! sends accel values to corresponding MPI interface neighbors
+
+        ! preparation of the contribution between partitions using MPI
+        ! transfers mpi buffers to CPU
+        ! note: for asynchronous transfers, this copies the boundary region to the host asynchronously. The
+        !       MPI-send is done after compute_forces_outer_core_cuda,
+        !       once the inner element kernels are launched, and the memcpy has finished.
+        call transfer_boun_pot_from_device(Mesh_pointer,buffer_send_scalar_outer_core,1)
+
+        if( .not. GPU_ASYNC_COPY ) then
+          ! for synchronous transfers, sending over MPI can proceed directly
+          ! outer core
+          call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                                             buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
+                                             num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                                             nibool_interfaces_outer_core,&
+                                             my_neighbours_outer_core, &
+                                             request_send_scalar_oc,request_recv_scalar_oc)
+        endif
       endif
     else
       ! make sure the last communications are finished and processed
@@ -194,6 +219,18 @@
                                    request_send_scalar_oc,request_recv_scalar_oc)
       else
         ! on GPU
+        if( GPU_ASYNC_COPY ) then
+          ! while inner elements compute "Kernel_2", we wait for MPI to
+          ! finish and transfer the boundary terms to the device asynchronously
+          !
+          ! transfers mpi buffers onto GPU
+          call transfer_boundarypot_to_device(Mesh_pointer,NPROCTOT_VAL,buffer_recv_scalar_outer_core, &
+                                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                                              request_recv_scalar_oc, &
+                                              IREGION_OUTER_CORE,1)
+        endif
+
+        ! waits for mpi send/receive requests to be completed and assembles values
         call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
                                             buffer_recv_scalar_outer_core, &
                                             num_interfaces_outer_core,max_nibool_interfaces_oc, &
@@ -328,6 +365,21 @@
       ! on GPU
       ! includes FORWARD_OR_ADJOINT == 3
       call compute_forces_outer_core_cuda(Mesh_pointer,iphase,b_time,3)
+
+      ! initiates asynchronous MPI transfer
+      if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+        ! outer core region
+        ! wait for asynchronous copy to finish
+        call sync_copy_from_device(Mesh_pointer,iphase,b_buffer_send_scalar_outer_core,IREGION_OUTER_CORE,3)
+        ! sends mpi buffers
+        call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                              b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                              nibool_interfaces_outer_core,&
+                              my_neighbours_outer_core, &
+                              b_request_send_scalar_oc,b_request_recv_scalar_oc)
+      endif
+
     endif
 
 
@@ -402,14 +454,25 @@
                               b_request_send_scalar_oc,b_request_recv_scalar_oc)
       else
         ! on GPU
-        ! outer core
-        call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                              b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
-                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
-                              nibool_interfaces_outer_core,&
-                              my_neighbours_outer_core, &
-                              b_request_send_scalar_oc,b_request_recv_scalar_oc, &
-                              3) ! <-- 3 == adjoint b_accel
+        ! sends accel values to corresponding MPI interface neighbors
+
+        ! preparation of the contribution between partitions using MPI
+        ! transfers mpi buffers to CPU
+        ! note: for asynchronous transfers, this copies the boundary region to the host asynchronously. The
+        !       MPI-send is done after compute_forces_outer_core_cuda,
+        !       once the inner element kernels are launched, and the memcpy has finished.
+        call transfer_boun_pot_from_device(Mesh_pointer,b_buffer_send_scalar_outer_core,3)
+
+        if( .not. GPU_ASYNC_COPY ) then
+          ! for synchronous transfers, sending over MPI can proceed directly
+          ! outer core
+          call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                                b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                                nibool_interfaces_outer_core,&
+                                my_neighbours_outer_core, &
+                                b_request_send_scalar_oc,b_request_recv_scalar_oc)
+        endif
       endif ! GPU
     else
       ! make sure the last communications are finished and processed
@@ -425,6 +488,18 @@
                                    b_request_send_scalar_oc,b_request_recv_scalar_oc)
       else
         ! on GPU
+        if( GPU_ASYNC_COPY ) then
+          ! while inner elements compute "Kernel_2", we wait for MPI to
+          ! finish and transfer the boundary terms to the device asynchronously
+          !
+          ! transfers mpi buffers onto GPU
+          call transfer_boundarypot_to_device(Mesh_pointer,NPROCTOT_VAL,b_buffer_recv_scalar_outer_core, &
+                                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                                              b_request_recv_scalar_oc, &
+                                              IREGION_OUTER_CORE,3)
+        endif
+
+        ! waits for mpi send/receive requests to be completed and assembles values
         call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
                                             b_buffer_recv_scalar_outer_core, &
                                             num_interfaces_outer_core,max_nibool_interfaces_oc, &

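Note on the overlap scheme used in the calling routine above: with GPU_ASYNC_COPY, the device-to-host copy of the MPI boundary buffer is started right after the outer-element kernel, the MPI sends are posted only once that copy has completed, and both overlap with the inner-element kernel. A minimal, self-contained control-flow sketch follows; the stub routines are hypothetical placeholders that only print the ordering of operations, not the actual CUDA/MPI glue:

program overlap_sketch
  ! demonstrates the call ordering of the GPU_ASYNC_COPY scheme
  implicit none
  logical, parameter :: GPU_ASYNC_COPY = .true.
  integer :: iphase

  do iphase = 1, 2
    call launch_force_kernel(iphase)          ! returns immediately (kernel queued)
    if( iphase == 1 ) then
      call start_boundary_copy_to_host()      ! asynchronous D2H memcpy
      if( .not. GPU_ASYNC_COPY ) call post_mpi_sends()  ! blocking path
    else
      if( GPU_ASYNC_COPY ) then
        call wait_for_boundary_copy()         ! host buffer now valid
        call post_mpi_sends()                 ! overlaps with inner-element kernel
      endif
    endif
  enddo

  if( GPU_ASYNC_COPY ) call start_boundary_copy_to_device()  ! H2D after MPI receive
  call assemble_boundary_values()             ! waits for MPI requests, adds contributions

contains

  subroutine launch_force_kernel(iphase)
    integer, intent(in) :: iphase
    print *,'kernel launch, iphase = ',iphase
  end subroutine

  subroutine start_boundary_copy_to_host()
    print *,'start asynchronous device->host copy of boundary buffer'
  end subroutine

  subroutine wait_for_boundary_copy()
    print *,'wait until device->host copy has finished'
  end subroutine

  subroutine post_mpi_sends()
    print *,'post non-blocking MPI sends of boundary buffer'
  end subroutine

  subroutine start_boundary_copy_to_device()
    print *,'start asynchronous host->device copy of received buffer'
  end subroutine

  subroutine assemble_boundary_values()
    print *,'wait for MPI requests and assemble boundary values'
  end subroutine

end program overlap_sketch
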
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -152,12 +152,44 @@
       ! contains forward FORWARD_OR_ADJOINT == 1
       ! for crust/mantle
       call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,1)
+
+      ! initiates asynchronous MPI transfer
+      if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+        ! crust/mantle region
+        ! wait for asynchronous copy to finish
+        call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_crust_mantle,IREGION_CRUST_MANTLE,1)
+        ! sends mpi buffers
+        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                      buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
+                      num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                      nibool_interfaces_crust_mantle,&
+                      my_neighbours_crust_mantle, &
+                      request_send_vector_cm,request_recv_vector_cm)
+      endif
+
       ! for inner core
       call compute_forces_inner_core_cuda(Mesh_pointer,iphase,1)
+
+      ! initiates asynchronous MPI transfer
+      if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+        ! inner core region
+        ! wait for asynchronous copy to finish
+        call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_inner_core,IREGION_INNER_CORE,1)
+        ! sends mpi buffers
+        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                      buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
+                      num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                      nibool_interfaces_inner_core,&
+                      my_neighbours_inner_core, &
+                      request_send_vector_ic,request_recv_vector_ic)
+      endif
     endif ! GPU_MODE
 
+
+
     ! computes additional contributions to acceleration field
     if( iphase == 1 ) then
+       ! this is the phase for outer (boundary) elements
 
        ! absorbing boundaries
        ! Stacey
@@ -272,22 +304,37 @@
                       request_send_vector_ic,request_recv_vector_ic)
       else
         ! on GPU
-        ! crust mantle
-        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                      buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
-                      num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
-                      nibool_interfaces_crust_mantle,&
-                      my_neighbours_crust_mantle, &
-                      request_send_vector_cm,request_recv_vector_cm, &
-                      IREGION_CRUST_MANTLE,1) ! <-- 1 == fwd accel
-        ! inner core
-        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                      buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
-                      num_interfaces_inner_core,max_nibool_interfaces_ic, &
-                      nibool_interfaces_inner_core,&
-                      my_neighbours_inner_core, &
-                      request_send_vector_ic,request_recv_vector_ic, &
-                      IREGION_INNER_CORE,1)
+        ! sends accel values to corresponding MPI interface neighbors
+
+        ! prepares the contributions between partitions using MPI
+        ! transfers MPI buffers to the CPU
+        ! note: for asynchronous copies, this transfers the boundary region to the host asynchronously. The
+        !       MPI send is done after compute_forces_crust_mantle_cuda / compute_forces_inner_core_cuda,
+        !       once the inner-element kernels are launched and the memcpy has finished.
+        call transfer_boun_from_device(Mesh_pointer, &
+                                       buffer_send_vector_crust_mantle,&
+                                       IREGION_CRUST_MANTLE,1)
+        call transfer_boun_from_device(Mesh_pointer, &
+                                       buffer_send_vector_inner_core,&
+                                       IREGION_INNER_CORE,1)
+
+        if( .not. GPU_ASYNC_COPY ) then
+          ! for synchronous transfers, the MPI send can proceed directly
+          ! crust mantle
+          call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                        buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
+                        num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                        nibool_interfaces_crust_mantle,&
+                        my_neighbours_crust_mantle, &
+                        request_send_vector_cm,request_recv_vector_cm)
+          ! inner core
+          call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                        buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
+                        num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                        nibool_interfaces_inner_core,&
+                        my_neighbours_inner_core, &
+                        request_send_vector_ic,request_recv_vector_ic)
+        endif
       endif ! GPU_MODE
     else
       ! waits for send/receive requests to be completed and assembles values
@@ -309,6 +356,24 @@
                               request_send_vector_ic,request_recv_vector_ic)
       else
         ! on GPU
+        if( GPU_ASYNC_COPY ) then
+          ! while inner elements compute "Kernel_2", we wait for MPI to
+          ! finish and transfer the boundary terms to the device asynchronously
+          !
+          ! transfers mpi buffers onto GPU
+          ! crust/mantle region
+          call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,buffer_recv_vector_crust_mantle, &
+                                           num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                                           request_recv_vector_cm, &
+                                           IREGION_CRUST_MANTLE,1)
+          ! inner core region
+          call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,buffer_recv_vector_inner_core, &
+                                           num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                                           request_recv_vector_ic, &
+                                           IREGION_INNER_CORE,1)
+        endif
+
+        ! waits for mpi send/receive requests to be completed and assembles values
         ! crust mantle
         call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
                             buffer_recv_vector_crust_mantle, &
@@ -537,8 +602,39 @@
      ! contains adjoint FORWARD_OR_ADJOINT == 3
       ! for crust/mantle
       call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,3)
+
+      ! initiates asynchronous MPI transfer
+      if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+        ! crust/mantle region
+        ! wait for asynchronous copy to finish
+        call sync_copy_from_device(Mesh_pointer,iphase,b_buffer_send_vector_cm,IREGION_CRUST_MANTLE,3)
+        ! sends mpi buffers
+        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                      b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
+                      num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                      nibool_interfaces_crust_mantle,&
+                      my_neighbours_crust_mantle, &
+                      b_request_send_vector_cm,b_request_recv_vector_cm)
+      endif
+
       ! for inner core
       call compute_forces_inner_core_cuda(Mesh_pointer,iphase,3)
+
+      ! initiates asynchronous MPI transfer
+      if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+        ! inner core region
+        ! wait for asynchronous copy to finish
+        call sync_copy_from_device(Mesh_pointer,iphase,b_buffer_send_vector_inner_core,IREGION_INNER_CORE,3)
+
+        ! sends mpi buffers
+        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                      b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+                      num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                      nibool_interfaces_inner_core,&
+                      my_neighbours_inner_core, &
+                      b_request_send_vector_ic,b_request_recv_vector_ic)
+      endif
+
     endif ! GPU_MODE
 
     ! computes additional contributions to acceleration field
@@ -644,25 +740,39 @@
                       b_request_send_vector_ic,b_request_recv_vector_ic)
       else
         ! on GPU
-        ! crust mantle
-        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                    b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
-                    num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
-                    nibool_interfaces_crust_mantle,&
-                    my_neighbours_crust_mantle, &
-                    b_request_send_vector_cm,b_request_recv_vector_cm, &
-                    IREGION_CRUST_MANTLE,3) ! <-- 3 == adjoint b_accel
-        ! inner core
-        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                    b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
-                    num_interfaces_inner_core,max_nibool_interfaces_ic, &
-                    nibool_interfaces_inner_core,&
-                    my_neighbours_inner_core, &
-                    b_request_send_vector_ic,b_request_recv_vector_ic, &
-                    IREGION_INNER_CORE,3)
+        ! sends accel values to corresponding MPI interface neighbors
+
+        ! prepares the contributions between partitions using MPI
+        ! transfers MPI buffers to the CPU
+        ! note: for asynchronous copies, this transfers the boundary region to the host asynchronously. The
+        !       MPI send is done after compute_forces_crust_mantle_cuda / compute_forces_inner_core_cuda,
+        !       once the inner-element kernels are launched and the memcpy has finished.
+        call transfer_boun_from_device(Mesh_pointer, &
+                                       b_buffer_send_vector_cm,&
+                                       IREGION_CRUST_MANTLE,3)
+        call transfer_boun_from_device(Mesh_pointer, &
+                                       b_buffer_send_vector_inner_core,&
+                                       IREGION_INNER_CORE,3)
+
+        if( .not. GPU_ASYNC_COPY ) then
+          ! for synchronous transfers, the MPI send can proceed directly
+          ! crust mantle
+          call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                      b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
+                      num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                      nibool_interfaces_crust_mantle,&
+                      my_neighbours_crust_mantle, &
+                      b_request_send_vector_cm,b_request_recv_vector_cm)
+          ! inner core
+          call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                      b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+                      num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                      nibool_interfaces_inner_core,&
+                      my_neighbours_inner_core, &
+                      b_request_send_vector_ic,b_request_recv_vector_ic)
+        endif
       endif ! GPU
     else
-      ! waits for send/receive requests to be completed and assembles values
       ! adjoint / kernel runs
       ! waits for send/receive requests to be completed and assembles values
       if(.NOT. GPU_MODE) then
@@ -684,6 +794,25 @@
 
       else
         ! on GPU
+        if( GPU_ASYNC_COPY ) then
+          ! while inner elements compute "Kernel_2", we wait for MPI to
+          ! finish and transfer the boundary terms to the device asynchronously
+          !
+          ! transfers mpi buffers onto GPU
+          ! crust/mantle region
+          call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,b_buffer_recv_vector_cm, &
+                                           num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                                           b_request_recv_vector_cm, &
+                                           IREGION_CRUST_MANTLE,3)
+          ! inner core region
+          call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,b_buffer_recv_vector_inner_core, &
+                                           num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                                           b_request_recv_vector_ic, &
+                                           IREGION_INNER_CORE,3)
+        endif
+
+        ! waits for mpi send/receive requests to be completed and assembles values
         ! crust mantle
         call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
                           b_buffer_recv_vector_cm, &

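For reference, the assemble_MPI_*_send_cuda / assemble_MPI_*_write_cuda pair boils down to non-blocking MPI calls whose requests are waited on only after the overlapped work has finished. A stripped-down, runnable sketch of that pattern (the buffer and request names are illustrative, not the real arrays):

program assemble_sketch
  use mpi
  implicit none
  integer :: ier, rank, nproc, other
  integer :: request_send, request_recv
  real :: buffer_send(4), buffer_recv(4)

  call MPI_Init(ier)
  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ier)
  call MPI_Comm_size(MPI_COMM_WORLD, nproc, ier)
  other = mod(rank + 1, nproc)
  buffer_send(:) = real(rank + 1)

  ! "send" part: post the receive first, then the send (both non-blocking)
  call MPI_Irecv(buffer_recv, 4, MPI_REAL, other, 0, MPI_COMM_WORLD, request_recv, ier)
  call MPI_Isend(buffer_send, 4, MPI_REAL, other, 0, MPI_COMM_WORLD, request_send, ier)

  ! ... inner-element kernels run here, overlapped with communication ...

  ! "write" part: wait for the receive, then add the neighbour contribution
  call MPI_Wait(request_recv, MPI_STATUS_IGNORE, ier)
  buffer_send(:) = buffer_send(:) + buffer_recv(:)
  call MPI_Wait(request_send, MPI_STATUS_IGNORE, ier)

  print *,'rank ',rank,' assembled value: ',buffer_send(1)
  call MPI_Finalize(ier)
end program assemble_sketch
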
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -76,216 +76,212 @@
 
   !   xmin
   ! if two chunks exclude this face for one of them
-  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
+  if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC ) then
+    if( nspec2D_xmin_outer_core > 0 ) then
+      ! adds boundary contributions
+      if( .NOT. GPU_MODE) then
+        ! on CPU
+        do ispec2D=1,nspec2D_xmin_outer_core
 
-    if( .NOT. GPU_MODE) then
-      ! on CPU
-      do ispec2D=1,nspec2D_xmin_outer_core
+          ispec=ibelm_xmin_outer_core(ispec2D)
 
-        ispec=ibelm_xmin_outer_core(ispec2D)
+          ! exclude elements that are not on absorbing edges
+          if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
 
-        ! exclude elements that are not on absorbing edges
-        if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+          i=1
+          do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+            do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+              iglob=ibool_outer_core(i,j,k,ispec)
 
-        i=1
-        do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
-          do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
-            iglob=ibool_outer_core(i,j,k,ispec)
+              sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-            sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+              weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-            weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+              accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-            accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
-
-            if( SAVE_FORWARD ) then
-              absorb_xmin_outer_core(j,k,ispec2D) = weight*sn
-            endif
+              if( SAVE_FORWARD ) then
+                absorb_xmin_outer_core(j,k,ispec2D) = weight*sn
+              endif
+            enddo
           enddo
         enddo
-      enddo
 
-    else
-      ! on GPU
-      if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                                absorb_xmin_outer_core, &
-                                                                4) ! <= xmin
-    endif
+      else
+        ! on GPU
+        call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_xmin_outer_core,4) ! <= xmin
+      endif
 
-    ! writes absorbing boundary values
-    if( SAVE_FORWARD .and. nspec2D_xmin_outer_core > 0 ) then
-      call write_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,it)
+      ! writes absorbing boundary values to file
+      if( SAVE_FORWARD ) then
+        call write_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,it)
+      endif
     endif
-
   endif
 
   !   xmax
   ! if two chunks exclude this face for one of them
-  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
+  if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB ) then
+    if( nspec2D_xmax_outer_core > 0 ) then
+      ! adds boundary contributions
+      if( .NOT. GPU_MODE ) then
+        ! on CPU
+        do ispec2D=1,nspec2D_xmax_outer_core
 
-    if( .NOT. GPU_MODE ) then
-      ! on CPU
-      do ispec2D=1,nspec2D_xmax_outer_core
+          ispec=ibelm_xmax_outer_core(ispec2D)
 
-        ispec=ibelm_xmax_outer_core(ispec2D)
+          ! exclude elements that are not on absorbing edges
+          if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
 
-        ! exclude elements that are not on absorbing edges
-        if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+          i=NGLLX
+          do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+            do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+              iglob=ibool_outer_core(i,j,k,ispec)
 
-        i=NGLLX
-        do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
-          do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
-            iglob=ibool_outer_core(i,j,k,ispec)
+              sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-            sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+              weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-            weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+              accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-            accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+              if( SAVE_FORWARD ) then
+                absorb_xmax_outer_core(j,k,ispec2D) = weight*sn
+              endif
 
-            if( SAVE_FORWARD ) then
-              absorb_xmax_outer_core(j,k,ispec2D) = weight*sn
-            endif
-
+            enddo
           enddo
         enddo
-      enddo
 
-    else
-      ! on GPU
-      if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                                absorb_xmax_outer_core, &
-                                                                5) ! <= xmax
-    endif
+      else
+        ! on GPU
+        call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_xmax_outer_core,5) ! <= xmax
+      endif
 
-    if( SAVE_FORWARD .and. nspec2D_xmax_outer_core > 0 ) then
-      call write_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,it)
+      if( SAVE_FORWARD ) then
+        call write_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,it)
+      endif
     endif
 
   endif
 
   !   ymin
+  if( nspec2D_ymin_outer_core > 0 ) then
+    ! adds boundary contributions
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_ymin_outer_core
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D=1,nspec2D_ymin_outer_core
+        ispec=ibelm_ymin_outer_core(ispec2D)
 
-      ispec=ibelm_ymin_outer_core(ispec2D)
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+        j=1
+        do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+          do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-      j=1
-      do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
-        do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+            sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-          weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+            accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+            if( SAVE_FORWARD ) then
+              absorb_ymin_outer_core(i,k,ispec2D) = weight*sn
+            endif
 
-          if( SAVE_FORWARD ) then
-            absorb_ymin_outer_core(i,k,ispec2D) = weight*sn
-          endif
-
+          enddo
         enddo
       enddo
-    enddo
+    else
+      ! on GPU
+      call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_ymin_outer_core,6) ! <= ymin
+    endif
 
-  else
-    ! on GPU
-    if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                              absorb_ymin_outer_core, &
-                                                              6) ! <= ymin
+    if( SAVE_FORWARD ) then
+      call write_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,it)
+    endif
   endif
 
-  if( SAVE_FORWARD .and. nspec2D_ymin_outer_core > 0 ) then
-    call write_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,it)
-  endif
-
   !   ymax
+  if( nspec2D_ymax_outer_core > 0 ) then
+    ! adds boundary contributions
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_ymax_outer_core
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D=1,nspec2D_ymax_outer_core
+        ispec=ibelm_ymax_outer_core(ispec2D)
 
-      ispec=ibelm_ymax_outer_core(ispec2D)
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+        j=NGLLY
+        do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+          do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-      j=NGLLY
-      do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
-        do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+            sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-          weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+            accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+            if( SAVE_FORWARD ) then
+              absorb_ymax_outer_core(i,k,ispec2D) = weight*sn
+            endif
 
-          if( SAVE_FORWARD ) then
-            absorb_ymax_outer_core(i,k,ispec2D) = weight*sn
-          endif
-
+          enddo
         enddo
       enddo
-    enddo
+    else
+      ! on GPU
+      call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_ymax_outer_core,7) ! <= ymax
+    endif
 
-  else
-    ! on GPU
-    if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                              absorb_ymax_outer_core, &
-                                                              7) ! <= ymax
+    if( SAVE_FORWARD ) then
+      call write_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,it)
+    endif
   endif
 
-  if( SAVE_FORWARD .and. nspec2D_ymax_outer_core > 0 ) then
-    call write_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,it)
-  endif
-
   ! zmin
 
   ! for surface elements exactly on the ICB
+  if( nspec2D_zmin_outer_core > 0 ) then
+    ! adds boundary contributions
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D = 1,nspec2D_zmin_outer_core
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D = 1,nspec2D_zmin_outer_core
+        ispec = ibelm_bottom_outer_core(ispec2D)
 
-      ispec = ibelm_bottom_outer_core(ispec2D)
+        k = 1
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            iglob = ibool_outer_core(i,j,k,ispec)
 
-      k = 1
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          iglob = ibool_outer_core(i,j,k,ispec)
+            sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
 
-          weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
+            accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+            if( SAVE_FORWARD ) then
+              absorb_zmin_outer_core(i,j,ispec2D) = weight*sn
+            endif
 
-          if( SAVE_FORWARD ) then
-            absorb_zmin_outer_core(i,j,ispec2D) = weight*sn
-          endif
-
+          enddo
         enddo
       enddo
-    enddo
+    else
+      ! on GPU
+      call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_zmin_outer_core,8) ! <= zmin
+    endif
 
-  else
-    ! on GPU
-    if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                              absorb_zmin_outer_core, &
-                                                              8) ! <= zmin
+    if( SAVE_FORWARD ) then
+      call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
+    endif
   endif
 
-  if( SAVE_FORWARD .and. nspec2D_zmin_outer_core > 0 ) then
-    call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
-  endif
-
   end subroutine compute_stacey_outer_core_forward
 
 
@@ -333,176 +329,165 @@
   !           simple approach is still fastest. (assuming that files are accessed on a local scratch disk)
 
   ! checks
-  if (SIMULATION_TYPE /= 3 ) return
+  if( SIMULATION_TYPE /= 3 ) return
 
   ! outer core
 
   !   xmin
   ! if two chunks exclude this face for one of them
-  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
-
+  if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC ) then
     ! reads absorbing boundary values
     if( nspec2D_xmin_outer_core > 0 ) then
       ! note: backward/reconstructed wavefields are read in after the Newmark time scheme in the first time loop
       !          this leads to a corresponding boundary condition at time index NSTEP - (it-1) = NSTEP - it + 1
       call read_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,NSTEP-it+1)
-    endif
 
-    if( .NOT. GPU_MODE) then
-      ! on CPU
-      do ispec2D=1,nspec2D_xmin_outer_core
+      ! adds boundary contributions
+      if( .NOT. GPU_MODE) then
+        ! on CPU
+        do ispec2D=1,nspec2D_xmin_outer_core
 
-        ispec=ibelm_xmin_outer_core(ispec2D)
+          ispec=ibelm_xmin_outer_core(ispec2D)
 
-        ! exclude elements that are not on absorbing edges
-        if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+          ! exclude elements that are not on absorbing edges
+          if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
 
-        i=1
-        do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
-          do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
-            iglob=ibool_outer_core(i,j,k,ispec)
+          i=1
+          do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+            do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+              iglob=ibool_outer_core(i,j,k,ispec)
 
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
+              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
+            enddo
           enddo
         enddo
-      enddo
-
-    else
-      ! on GPU
-      if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
-                                                                absorb_xmin_outer_core, &
-                                                                4) ! <= xmin
+      else
+        ! on GPU
+        call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_xmin_outer_core,4) ! <= xmin
+      endif
     endif
-
   endif
 
   !   xmax
   ! if two chunks exclude this face for one of them
-  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
-
+  if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB ) then
     if( nspec2D_xmax_outer_core > 0 ) then
+      ! reads in boundary contributions from file
       call read_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,NSTEP-it+1)
-    endif
 
-    if( .NOT. GPU_MODE ) then
-      ! on CPU
-      do ispec2D=1,nspec2D_xmax_outer_core
+      ! adds boundary contributions
+      if( .NOT. GPU_MODE ) then
+        ! on CPU
+        do ispec2D=1,nspec2D_xmax_outer_core
 
-        ispec=ibelm_xmax_outer_core(ispec2D)
+          ispec=ibelm_xmax_outer_core(ispec2D)
 
-        ! exclude elements that are not on absorbing edges
-        if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+          ! exclude elements that are not on absorbing edges
+          if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
 
-        i=NGLLX
-        do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
-          do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
-            iglob=ibool_outer_core(i,j,k,ispec)
+          i=NGLLX
+          do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+            do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+              iglob=ibool_outer_core(i,j,k,ispec)
 
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
+              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
+            enddo
           enddo
         enddo
-      enddo
-
-    else
-      ! on GPU
-      if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
-                                                                absorb_xmax_outer_core, &
-                                                                5) ! <= xmax
+      else
+        ! on GPU
+        call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_xmax_outer_core,5) ! <= xmax
+      endif
     endif
-
   endif
 
   !   ymin
   if( nspec2D_ymin_outer_core > 0 ) then
+    ! reads in boundary contributions from file
     call read_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,NSTEP-it+1)
-  endif
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D=1,nspec2D_ymin_outer_core
+    ! adds boundary contributions
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_ymin_outer_core
 
-      ispec=ibelm_ymin_outer_core(ispec2D)
+        ispec=ibelm_ymin_outer_core(ispec2D)
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
 
-      j=1
-      do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
-        do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+        j=1
+        do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+          do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
+          enddo
         enddo
       enddo
-    enddo
-
-  else
-    ! on GPU
-    if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
-                                                              absorb_ymin_outer_core, &
-                                                              6) ! <= ymin
+    else
+      ! on GPU
+      call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_ymin_outer_core,6) ! <= ymin
+    endif
   endif
 
   !   ymax
-
   if(  nspec2D_ymax_outer_core > 0 ) then
+    ! reads in boundary contributions from file
     call read_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,NSTEP-it+1)
-  endif
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D=1,nspec2D_ymax_outer_core
+    ! adds boundary contributions
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_ymax_outer_core
 
-      ispec=ibelm_ymax_outer_core(ispec2D)
+        ispec=ibelm_ymax_outer_core(ispec2D)
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
 
-      j=NGLLY
-      do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
-        do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+        j=NGLLY
+        do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+          do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
+          enddo
         enddo
       enddo
-    enddo
-
-  else
-    ! on GPU
-    if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
-                                                              absorb_ymax_outer_core, &
-                                                              7) ! <= ymax
+    else
+      ! on GPU
+      call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_ymax_outer_core,7) ! <= ymax
+    endif
   endif
 
   ! zmin
 
   ! for surface elements exactly on the ICB
   if( nspec2D_zmin_outer_core > 0 ) then
+    ! reads in boundary contributions from file
     call read_abs(8,absorb_zmin_outer_core,reclen_zmin,NSTEP-it+1)
-  endif
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D = 1,nspec2D_zmin_outer_core
+    ! adds boundary contributions
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D = 1,nspec2D_zmin_outer_core
 
-      ispec = ibelm_bottom_outer_core(ispec2D)
+        ispec = ibelm_bottom_outer_core(ispec2D)
 
-      k = 1
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          iglob = ibool_outer_core(i,j,k,ispec)
+        k = 1
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            iglob = ibool_outer_core(i,j,k,ispec)
 
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
+          enddo
         enddo
       enddo
-    enddo
-
-  else
-    ! on GPU
-    if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
-                                                              absorb_zmin_outer_core, &
-                                                              8) ! <= zmin
+    else
+      ! on GPU
+      call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_zmin_outer_core,8) ! <= zmin
+    endif
   endif
 
   end subroutine compute_stacey_outer_core_backward
@@ -549,164 +534,167 @@
 
   !   xmin
   ! if two chunks exclude this face for one of them
-  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
+  if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC ) then
+    if( nspec2D_xmin_outer_core > 0 ) then
+      ! adds boundary contribution
+      if( .NOT. GPU_MODE) then
+        ! on CPU
+        do ispec2D=1,nspec2D_xmin_outer_core
 
-    if( .NOT. GPU_MODE) then
-      ! on CPU
-      do ispec2D=1,nspec2D_xmin_outer_core
+          ispec=ibelm_xmin_outer_core(ispec2D)
 
-        ispec=ibelm_xmin_outer_core(ispec2D)
+          ! exclude elements that are not on absorbing edges
+          if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
 
-        ! exclude elements that are not on absorbing edges
-        if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+          i=1
+          do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+            do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+              iglob=ibool_outer_core(i,j,k,ispec)
 
-        i=1
-        do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
-          do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
-            iglob=ibool_outer_core(i,j,k,ispec)
+              sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-            sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+              weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-            weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
 
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+            enddo
           enddo
         enddo
-      enddo
-
-    else
-      ! on GPU
-      if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,4) ! <= xmin
+      else
+        ! on GPU
+        call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,4) ! <= xmin
+      endif
     endif
-
   endif
 
   !   xmax
   ! if two chunks exclude this face for one of them
-  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
+  if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB ) then
+    if( nspec2D_xmax_outer_core > 0 ) then
+      ! adds boundary contribution
+      if( .NOT. GPU_MODE ) then
+        ! on CPU
+        do ispec2D=1,nspec2D_xmax_outer_core
 
-    if( .NOT. GPU_MODE ) then
-      ! on CPU
-      do ispec2D=1,nspec2D_xmax_outer_core
+          ispec=ibelm_xmax_outer_core(ispec2D)
 
-        ispec=ibelm_xmax_outer_core(ispec2D)
+          ! exclude elements that are not on absorbing edges
+          if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
 
-        ! exclude elements that are not on absorbing edges
-        if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+          i=NGLLX
+          do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+            do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+              iglob=ibool_outer_core(i,j,k,ispec)
 
-        i=NGLLX
-        do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
-          do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
-            iglob=ibool_outer_core(i,j,k,ispec)
+              sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-            sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+              weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
 
-            weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
 
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+            enddo
           enddo
         enddo
-      enddo
-
-    else
-      ! on GPU
-      if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,5) ! <= xmax
+      else
+        ! on GPU
+        call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,5) ! <= xmax
+      endif
     endif
-
   endif
 
   !   ymin
+  if( nspec2D_ymin_outer_core > 0 ) then
+    ! adds boundary contribution
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_ymin_outer_core
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D=1,nspec2D_ymin_outer_core
+        ispec=ibelm_ymin_outer_core(ispec2D)
 
-      ispec=ibelm_ymin_outer_core(ispec2D)
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+        j=1
+        do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+          do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-      j=1
-      do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
-        do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+            sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-          weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
 
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+          enddo
         enddo
       enddo
-    enddo
-
-  else
-    ! on GPU
-    if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,6) ! <= ymin
+    else
+      ! on GPU
+      call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,6) ! <= ymin
+    endif
   endif
 
   !   ymax
+  if( nspec2D_ymax_outer_core > 0 ) then
+    ! adds boundary contribution
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_ymax_outer_core
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D=1,nspec2D_ymax_outer_core
+        ispec=ibelm_ymax_outer_core(ispec2D)
 
-      ispec=ibelm_ymax_outer_core(ispec2D)
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
 
-      ! exclude elements that are not on absorbing edges
-      if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+        j=NGLLY
+        do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+          do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
 
-      j=NGLLY
-      do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
-        do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
-          iglob=ibool_outer_core(i,j,k,ispec)
+            sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
 
-          weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
 
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+          enddo
         enddo
       enddo
-    enddo
-
-  else
-    ! on GPU
-    if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,7) ! <= ymax
+    else
+      ! on GPU
+      call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,7) ! <= ymax
+    endif
   endif
 
   ! zmin
 
   ! for surface elements exactly on the ICB
+  if( nspec2D_zmin_outer_core > 0 ) then
+    ! adds boundary contribution
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D = 1,nspec2D_zmin_outer_core
 
-  if( .NOT. GPU_MODE ) then
-    ! on CPU
-    do ispec2D = 1,nspec2D_zmin_outer_core
+        ispec = ibelm_bottom_outer_core(ispec2D)
 
-      ispec = ibelm_bottom_outer_core(ispec2D)
+        k = 1
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            iglob = ibool_outer_core(i,j,k,ispec)
 
-      k = 1
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          iglob = ibool_outer_core(i,j,k,ispec)
+            sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
 
-          sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+            weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
 
-          weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
 
-          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+          enddo
         enddo
       enddo
-    enddo
-
-  else
-    ! on GPU
-    if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,8) ! <= zmin
+    else
+      ! on GPU
+      call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,8) ! <= zmin
+    endif
   endif
 
   end subroutine compute_stacey_outer_core_backward_undoatt

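All six face loops above apply the same paraxial (Stacey) condition: the outgoing-wave traction is proportional to veloc/vp, weighted by the 2D surface Jacobian and the GLL quadrature weights, and subtracted from the acceleration. A minimal single-face sketch with made-up values (array names and numbers are illustrative only):

program stacey_face_sketch
  implicit none
  integer, parameter :: NGLL = 5
  real, dimension(NGLL,NGLL) :: veloc, vp, jacobian2D, wgll2D, accel, absorb
  integer :: i, k

  veloc(:,:) = 1.0e-3      ! fluid potential velocity at the face points
  vp(:,:) = 8.0            ! P-wave speed (non-dimensionalized in the real code)
  jacobian2D(:,:) = 1.0    ! surface Jacobian of the boundary face
  wgll2D(:,:) = 0.2        ! product of 1D GLL weights
  accel(:,:) = 0.0

  do k = 1, NGLL
    do i = 1, NGLL
      ! paraxial absorbing term: weight * veloc / vp
      absorb(i,k) = jacobian2D(i,k) * wgll2D(i,k) * veloc(i,k) / vp(i,k)
      accel(i,k) = accel(i,k) - absorb(i,k)
    enddo
  enddo

  ! absorb(:,:) is what gets written to disk when SAVE_FORWARD is on,
  ! so the backward run can simply subtract it again
  print *,'sample absorbing contribution: ',absorb(1,1)
end program stacey_face_sketch
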
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -171,7 +171,7 @@
 
   subroutine get_attenuation_scale_factor(myrank, T_c_source, tau_mu, tau_sigma, Q_mu, scale_factor)
 
-  use constants
+  use constants,only: ZERO,ONE,TWO,PI,GRAV,RHOAV,TWO_PI,N_SLS
 
   implicit none
 
@@ -227,8 +227,8 @@
   scale_factor = factor_scale_mu * factor_scale_mu0
 
   !--- check that the correction factor is close to one
-  if(scale_factor < 0.8 .or. scale_factor > 1.2) then
-     write(*,*)'scale factor: ', scale_factor
+  if(scale_factor < 0.8d0 .or. scale_factor > 1.2d0) then
+     print*,'error: incorrect scale factor: ', scale_factor
      call exit_MPI(myrank,'incorrect correction factor in attenuation model')
   endif
 

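The change from 0.8/1.2 to 0.8d0/1.2d0 matters because a default-real literal carries only single precision before it is promoted to double. A small stand-alone demonstration (behaviour shown for IEEE-754 arithmetic):

program literal_precision
  implicit none
  double precision :: scale_factor
  scale_factor = 0.8d0
  ! single-precision 0.8 promoted to double is ~0.800000011920929,
  ! slightly above the exact double 0.8d0 ~0.8000000000000000444,
  ! so the two comparisons disagree right at the threshold:
  print *,'scale_factor < 0.8   : ', scale_factor < 0.8     ! prints T
  print *,'scale_factor < 0.8d0 : ', scale_factor < 0.8d0   ! prints F
end program literal_precision
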
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -762,7 +762,7 @@
         epidist_found(nrec_found) = epidist(irec)
 
         ! writes out actual receiver location to vtk file
-        write(IOVTK,*) sngl(x_found(irec)), sngl(y_found(irec)), sngl(z_found(irec))
+        write(IOVTK,'(3e18.6)') sngl(x_found(irec)), sngl(y_found(irec)), sngl(z_found(irec))
       endif
     enddo
 

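The switch to an explicit '(3e18.6)' format keeps the VTK point records in fixed-width columns; list-directed output ('*') is free to choose spacing, line breaks and exponent style per compiler, which some VTK readers trip over. A quick comparison:

program vtk_point_format
  implicit none
  real :: x, y, z
  x = 1234.5678 ; y = -0.000123 ; z = 6371000.0
  write(*,*) x, y, z              ! list-directed: compiler-dependent layout
  write(*,'(3e18.6)') x, y, z     ! explicit: three fixed-width columns per point
end program vtk_point_format
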
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -669,9 +669,9 @@
         write(IMAIN,*) '    time shift: ',tshift_cmt(isource),' seconds'
 
         ! writes out actual source position to vtk file
-        write(IOVTK,*) sngl(x_found_source(isource_in_this_subset)), &
-                      sngl(y_found_source(isource_in_this_subset)), &
-                      sngl(z_found_source(isource_in_this_subset))
+        write(IOVTK,'(3e18.6)') sngl(x_found_source(isource_in_this_subset)), &
+                                sngl(y_found_source(isource_in_this_subset)), &
+                                sngl(z_found_source(isource_in_this_subset))
 
         ! get latitude, longitude and depth of the source that will be used
         call xyz_2_rthetaphi_dble(x_found_source(isource_in_this_subset), &

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -311,6 +311,12 @@
   !if (OUTPUT_SEISMOS_SAC_ALPHANUM .and. (mod(NTSTEP_BETWEEN_OUTPUT_SEISMOS,5)/=0)) &
   !  stop 'if OUTPUT_SEISMOS_SAC_ALPHANUM = .true. then NTSTEP_BETWEEN_OUTPUT_SEISMOS must be a multiple of 5, check the Par_file'
 
+  ! subsets used to save adjoint sources must not be larger than the whole time series,
+  ! otherwise we waste memory
+  if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) then
+    if(NTSTEP_BETWEEN_READ_ADJSRC > NSTEP) NTSTEP_BETWEEN_READ_ADJSRC = NSTEP
+  endif
+
   end subroutine setup_timesteps
 
 !
@@ -332,6 +338,9 @@
   character(len=256) :: filename,adj_source_file
   character(len=2) :: bic
   integer :: ier
+  integer,dimension(:),allocatable :: tmp_rec_local_all
+  integer :: maxrec,maxproc(1)
+  double precision :: size
 
   ! user output
   if( myrank == 0 ) then
@@ -430,9 +439,12 @@
               read(IIN,*,iostat=ier) junk,junk
               if( ier == 0 ) itime = itime + 1
             enddo
-            if( itime /= NSTEP) &
+            ! checks length
+            if( itime /= NSTEP) then
+              print*,'adjoint source error: ',trim(filename),' has length',itime,' but should be',NSTEP
               call exit_MPI(myrank,&
                 'file '//trim(filename)//' has wrong length, please check with your simulation duration')
+            endif
 
             ! updates counter for found files
             nadj_files_found = nadj_files_found + 1
@@ -478,6 +490,86 @@
       call exit_MPI(myrank,'total number of receivers is incorrect')
   endif
 
+  ! statistics about memory allocation for seismograms & adj_sourcearrays
+  ! gathers info about receivers on master
+  if( myrank == 0 ) then
+    ! only master process needs full arrays allocated
+    allocate(tmp_rec_local_all(NPROCTOT_VAL),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating temporary array tmp_rec_local_all')
+  else
+    ! dummy arrays
+    allocate(tmp_rec_local_all(1),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating temporary array tmp_rec_local_all')
+  endif
+
+  ! seismograms
+  ! gather from slaves on master
+  tmp_rec_local_all(:) = 0
+  tmp_rec_local_all(1) = nrec_local
+  if( NPROCTOT_VAL > 1 ) then
+    call gather_all_singlei(nrec_local,tmp_rec_local_all,NPROCTOT_VAL)
+  endif
+  ! user output
+  if( myrank == 0 ) then
+    ! determines maximum number of local receivers and corresponding rank
+    maxrec = maxval(tmp_rec_local_all(:))
+    maxproc = maxloc(tmp_rec_local_all(:))
+    ! seismograms array size in MB
+    if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3 ) then
+      ! seismograms need seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS)
+      size = dble(maxrec) * dble(NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. )
+    else
+      ! adjoint seismograms need seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS)
+      size = dble(maxrec) * dble(NDIM * NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. )
+    endif
+    ! outputs info
+    write(IMAIN,*)
+    write(IMAIN,*) 'seismograms:'
+    write(IMAIN,*) '  writing out seismograms at every NTSTEP_BETWEEN_OUTPUT_SEISMOS = ',NTSTEP_BETWEEN_OUTPUT_SEISMOS
+    write(IMAIN,*) '  maximum number of local receivers is ',maxrec,' in slice ',maxproc(1)
+    write(IMAIN,*) '  size of maximum seismogram array = ', sngl(size),'MB'
+    write(IMAIN,*) '                                   = ', sngl(size/1024.d0),'GB'
+    write(IMAIN,*)
+    call flush_IMAIN()
+  endif
+
+  ! adjoint sources
+  if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) then
+    ! gather from slaves on master
+    tmp_rec_local_all(:) = 0
+    tmp_rec_local_all(1) = nadj_rec_local
+    if( NPROCTOT_VAL > 1 ) then
+      call gather_all_singlei(nadj_rec_local,tmp_rec_local_all,NPROCTOT_VAL)
+    endif
+    ! user output
+    if( myrank == 0 ) then
+      ! determines maximum number of local receivers and corresponding rank
+      maxrec = maxval(tmp_rec_local_all(:))
+      maxproc = maxloc(tmp_rec_local_all(:))
+      !do i = 1, NPROCTOT_VAL
+      !  if( tmp_rec_local_all(i) > maxrec ) then
+      !    maxrec = tmp_rec_local_all(i)
+      !    maxproc = i-1
+      !  endif
+      !enddo
+      ! adj_sourcearrays size in MB
+      ! adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
+      size = dble(maxrec) * dble(NDIM * NGLLX * NGLLY * NGLLZ * NTSTEP_BETWEEN_READ_ADJSRC * CUSTOM_REAL / 1024. / 1024. )
+
+      ! outputs info
+      write(IMAIN,*) 'adjoint source arrays:'
+      write(IMAIN,*) '  reading adjoint sources at every NTSTEP_BETWEEN_READ_ADJSRC = ',NTSTEP_BETWEEN_READ_ADJSRC
+      write(IMAIN,*) '  maximum number of local adjoint sources is ',maxrec,' in slice ',maxproc(1)
+      write(IMAIN,*) '  size of maximum adjoint source array = ', sngl(size),'MB'
+      write(IMAIN,*) '                                       = ', sngl(size/1024.d0),'GB'
+      write(IMAIN,*)
+      call flush_IMAIN()
+    endif
+  endif
+
+  deallocate(tmp_rec_local_all)
+
+
   end subroutine setup_receivers
 
 !
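As a rough order of magnitude for the numbers reported above: with the usual NGLLX = NGLLY = NGLLZ = 5, NDIM = 3 and single precision (CUSTOM_REAL = 4 bytes), a slice holding, say, 10 local adjoint sources with NTSTEP_BETWEEN_READ_ADJSRC = 1000 needs 3 * 5**3 * 1000 * 4 bytes = 1.5 MB per source, i.e. about 15 MB for the adj_sourcearrays buffer. This is why capping NTSTEP_BETWEEN_READ_ADJSRC at NSTEP in setup_timesteps above avoids wasting memory.
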
@@ -545,36 +637,38 @@
 
     ! stores source arrays
     call setup_sources_receivers_srcarr()
-
   endif
 
   ! adjoint source arrays
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-    ! adjoint source buffer length
-    NSTEP_SUB_ADJ = ceiling( dble(NSTEP)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
+    ! initializes adjoint source buffer
+    ! reverse indexing
     allocate(iadj_vec(NSTEP),stat=ier)
     if( ier /= 0 ) call exit_MPI(myrank,'error allocating iadj_vec')
-
     ! initializes iadj_vec
     do it=1,NSTEP
-       iadj_vec(it) = NSTEP-it+1  ! default is for reversing entire record
+       iadj_vec(it) = NSTEP-it+1  ! default is for reversing entire record, e.g. 3000,2999,..,1
     enddo
 
+    ! number of adjoint source blocks to read in
+    NSTEP_SUB_ADJ = ceiling( dble(NSTEP)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
+
     if(nadj_rec_local > 0) then
       ! allocate adjoint source arrays
       allocate(adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC), &
-              stat=ier)
+               stat=ier)
       if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint sourcearrays')
       adj_sourcearrays(:,:,:,:,:,:) = 0._CUSTOM_REAL
 
       ! allocate indexing arrays
       allocate(iadjsrc(NSTEP_SUB_ADJ,2), &
-              iadjsrc_len(NSTEP_SUB_ADJ),stat=ier)
+               iadjsrc_len(NSTEP_SUB_ADJ),stat=ier)
       if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint indexing arrays')
+
       ! initializes iadjsrc, iadjsrc_len and iadj_vec
       call setup_sources_receivers_adjindx(NSTEP,NSTEP_SUB_ADJ, &
-                      NTSTEP_BETWEEN_READ_ADJSRC, &
-                      iadjsrc,iadjsrc_len,iadj_vec)
+                                           NTSTEP_BETWEEN_READ_ADJSRC, &
+                                           iadjsrc,iadjsrc_len,iadj_vec)
     endif
   endif
 
@@ -659,8 +753,8 @@
 !
 
   subroutine setup_sources_receivers_adjindx(NSTEP,NSTEP_SUB_ADJ, &
-                      NTSTEP_BETWEEN_READ_ADJSRC, &
-                      iadjsrc,iadjsrc_len,iadj_vec)
+                                             NTSTEP_BETWEEN_READ_ADJSRC, &
+                                             iadjsrc,iadjsrc_len,iadj_vec)
 
   use constants
 
@@ -674,9 +768,9 @@
 
   ! local parameters
   integer :: iadj_block,it,it_sub_adj
+  integer :: istart,iend
 
-  iadj_block = 1  !counts blocks
-
+  ! initializes
   iadjsrc(:,:) = 0
   iadjsrc_len(:) = 0
 
@@ -690,31 +784,51 @@
   !
   ! see routine: compute_add_sources_adjoint()
   !                     how the adjoint source is added to the (adjoint) acceleration field
+  ! initializes block counters
+  ! iadj_block counts the blocks: it increases from 1 (case it=1-1000)
+  ! over 2 (case it=1001-2000) to 3 (case it=2001-3000)
+  it_sub_adj = 0
+  iadj_block = 1
   do it=1,NSTEP
 
     ! block number
     ! e.g. increases from 1 (case it=1-1000), 2 (case it=1001-2000) to 3 (case it=2001-3000)
-    it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
+    ! beware: the ceiling() call below (kept commented out for reference) can return
+    ! a wrong integer due to floating-point round-off, e.g. ceiling(1000./1000.) might yield 2
+    !it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
 
     ! we are at the edge of a block
     if(mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0) then
-     ! block start time ( e.g. 2001)
-     iadjsrc(iadj_block,1) = NSTEP-it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC+1
-     ! block end time (e.g. 3000)
-     iadjsrc(iadj_block,2) = NSTEP-(it_sub_adj-1)*NTSTEP_BETWEEN_READ_ADJSRC
+      ! sets it_sub_adj subset number
+      it_sub_adj = iadj_block
 
-     ! final adj src array
-     ! e.g. will be from 1000 to 1, but doesn't go below 1 in cases where NSTEP isn't
-     ! a multiple of NTSTEP_BETWEEN_READ_ADJSRC
-     if(iadjsrc(iadj_block,1) < 0) iadjsrc(iadj_block,1) = 1
+      ! block start time (e.g. 2001)
+      istart = NSTEP-it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC+1
+      ! clamps the block start so it never goes below 1, which can happen
+      ! when NSTEP isn't a multiple of NTSTEP_BETWEEN_READ_ADJSRC
+      if( istart < 1 ) istart = 1
 
-     ! actual block length
-     iadjsrc_len(iadj_block) = iadjsrc(iadj_block,2)-iadjsrc(iadj_block,1)+1
+      ! block end time (e.g. 3000)
+      iend = NSTEP-(it_sub_adj-1)*NTSTEP_BETWEEN_READ_ADJSRC
 
-     ! increases block number
-     iadj_block = iadj_block+1
+      iadjsrc(iadj_block,1) = istart
+      iadjsrc(iadj_block,2) = iend
+
+      ! actual block length
+      iadjsrc_len(iadj_block) = iend - istart + 1
+
+      ! increases block number
+      iadj_block = iadj_block + 1
     endif
 
+    ! consistency check: it_sub_adj must always equal the current block number
+    ! (i.e. what the ceiling() expression above would have computed)
+    if( it_sub_adj /= iadj_block-1 ) then
+      print*,'error: inconsistent block number for reverse adjoint source indexing'
+      print*,'  it_sub_adj = ',it_sub_adj,'should be equal to ',iadj_block-1
+      print*,'  it = ',it,' istart/iend = ',istart,iend,' NTSTEP_BETWEEN_READ_ADJSRC = ',NTSTEP_BETWEEN_READ_ADJSRC
+      stop 'error reverse adjoint source indexing'
+    endif
+
     ! time stepping for adjoint sources:
     ! adjoint time step that corresponds to time step in simulation (it).
     ! note, that adjoint source has to be time-reversed with respect to the forward wavefield
@@ -724,6 +838,9 @@
     !         so iadj_vec(1001) = 1000 - 0, iadj_vec(1002) = 1000 - 1, .. and so on again down to 1
     !         likewise for block 3: iadj_vec(2001) to iadj_vec(3000) again runs from 1000 down to 1
     iadj_vec(it) = iadjsrc_len(it_sub_adj) - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC)
+
+    ! keeps the index within valid bounds, i.e. at least 1
+    if( iadj_vec(it) < 1 ) iadj_vec(it) = 1
   enddo
 
   end subroutine setup_sources_receivers_adjindx
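To make the reverse indexing concrete, a minimal standalone sketch of the block
bookkeeping above, using the 3000/1000 example from the comments (illustrative
values only):

  program adj_indexing_demo
    implicit none
    integer, parameter :: NSTEP = 3000, NTSTEP = 1000
    integer, parameter :: NSTEP_SUB_ADJ = (NSTEP + NTSTEP - 1) / NTSTEP  ! integer ceiling
    integer :: iadjsrc(NSTEP_SUB_ADJ,2), iadjsrc_len(NSTEP_SUB_ADJ), iadj_vec(NSTEP)
    integer :: it, iadj_block, it_sub_adj, istart, iend

    iadjsrc(:,:) = 0; iadjsrc_len(:) = 0
    it_sub_adj = 0; iadj_block = 1

    do it = 1,NSTEP
      ! block edge: fill in start/end indices of the (time-reversed) block
      if( mod(it-1,NTSTEP) == 0 ) then
        it_sub_adj = iadj_block
        istart = NSTEP - it_sub_adj*NTSTEP + 1
        if( istart < 1 ) istart = 1
        iend = NSTEP - (it_sub_adj-1)*NTSTEP
        iadjsrc(iadj_block,1) = istart
        iadjsrc(iadj_block,2) = iend
        iadjsrc_len(iadj_block) = iend - istart + 1
        iadj_block = iadj_block + 1
      endif
      ! adjoint time step within the current block, counting down
      iadj_vec(it) = iadjsrc_len(it_sub_adj) - mod(it-1,NTSTEP)
      if( iadj_vec(it) < 1 ) iadj_vec(it) = 1
    enddo

    ! expected: blocks (2001,3000), (1001,2000), (1,1000)
    print *, 'iadjsrc = ', (iadjsrc(it,1), iadjsrc(it,2), it=1,NSTEP_SUB_ADJ)
    ! expected: 1000, 1, 1000
    print *, 'iadj_vec(1), iadj_vec(1000), iadj_vec(1001) = ', iadj_vec(1), iadj_vec(1000), iadj_vec(1001)
  end program adj_indexing_demo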
@@ -800,7 +917,7 @@
   else
     ! allocates dummy array since we need it to pass as argument e.g. in write_seismograms() routine
    ! note: nrec_local is zero here; Fortran 90/95 allows zero-sized array allocation
-    allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+    allocate(seismograms(NDIM,0,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
     if( ier /= 0) stop 'error while allocating zero seismograms'
   endif
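A minimal sketch of the zero-sized allocation used above: an extent of 0 is legal in
Fortran 90/95 and the resulting array can safely be passed as an actual argument as
long as it is never indexed (array name and extents here are illustrative):

  program zero_alloc_demo
    implicit none
    real, allocatable :: seismograms(:,:,:)
    integer :: ier
    ! second extent is 0: allocation succeeds with size(seismograms) == 0
    allocate(seismograms(3,0,1000),stat=ier)
    if( ier /= 0 ) stop 'error while allocating zero seismograms'
    print *, 'allocated: ', allocated(seismograms), ' total size: ', size(seismograms)
  end program zero_alloc_demo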
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -40,6 +40,7 @@
 
     ! gets resulting array values onto CPU
     if( GPU_MODE ) then
+      ! gets field values from GPU
       ! this transfers fields only in elements with stations for efficiency
       call write_seismograms_transfer_cuda(Mesh_pointer, &
                                 displ_crust_mantle,b_displ_crust_mantle, &
@@ -49,6 +50,15 @@
                                 number_receiver_global, &
                                 ispec_selected_rec,ispec_selected_source, &
                                 ibool_crust_mantle)
+
+      ! waits for the asynchronous copy of the field values from the GPU to finish
+      if( GPU_ASYNC_COPY ) then
+        call transfer_seismo_from_device_async(Mesh_pointer, &
+                                               displ_crust_mantle,b_displ_crust_mantle, &
+                                               number_receiver_global,ispec_selected_rec,ispec_selected_source, &
+                                               ibool_crust_mantle)
+      endif
+
     endif
 
     ! computes traces at interpolated receiver locations
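The GPU_ASYNC_COPY branch follows a launch-then-wait pattern: the transfer call above
only starts the device-to-host copy, and the second call blocks until the data has
arrived on the host. A minimal sketch of that control flow (the subroutines here are
hypothetical stand-ins, not the actual CUDA-side routines):

  program async_copy_pattern
    implicit none
    call launch_copy_from_device()   ! returns immediately; copy runs on its own stream
    call do_other_cpu_work()         ! CPU work overlaps with the transfer
    call wait_for_copy()             ! blocks until the host buffers are valid
  contains
    subroutine launch_copy_from_device()
      print *, 'device-to-host copy launched'
    end subroutine launch_copy_from_device
    subroutine do_other_cpu_work()
      print *, 'overlapping CPU work'
    end subroutine do_other_cpu_work
    subroutine wait_for_copy()
      print *, 'copy complete'
    end subroutine wait_for_copy
  end program async_copy_pattern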

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90	2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90	2014-01-13 16:06:21 UTC (rev 22991)
@@ -313,7 +313,7 @@
 
  ! type: hexahedra (VTK cell type 12)
   write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
-  write(IOVTK,*) (12,ispec=1,nspec)
+  write(IOVTK,'(6i12)') (12,ispec=1,nspec)
   write(IOVTK,*) ""
 
   ! writes out gll-data (velocity) for each element point
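A minimal standalone sketch of the explicit CELL_TYPES formatting above: cell type 12
is VTK_HEXAHEDRON, and the '(6i12)' format writes six 12-character integers per line
instead of the compiler-dependent list-directed layout (unit number and nspec are
illustrative):

  program vtk_cell_types_demo
    implicit none
    integer, parameter :: IOVTK = 20, nspec = 10
    integer :: ispec
    open(unit=IOVTK,file='cell_types_demo.vtk',status='unknown')
    write(IOVTK,'(a,i12)') 'CELL_TYPES ',nspec
    ! six integers per line, each right-aligned in a 12-character field
    write(IOVTK,'(6i12)') (12,ispec=1,nspec)
    write(IOVTK,*) ''
    close(IOVTK)
  end program vtk_cell_types_demo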


