[cig-commits] r22746 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: setup src/cuda src/shared src/specfem3D

danielpeter at geodynamics.org
Fri Aug 30 05:45:45 PDT 2013


Author: danielpeter
Date: 2013-08-30 05:45:44 -0700 (Fri, 30 Aug 2013)
New Revision: 22746

Added:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90
Removed:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/parallel.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_boundary_kernel.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_noDev.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_noDev.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_regular_points.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_regular_kernels.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
Log:
separates forward and backward routines; renames read_mesh_databases.f90 and compute_forces_outer_core_Dev.f90 to .F90 endings so that they can use preprocessor directives; adds regular kernel routines

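The unifying pattern across this commit: every GPU wrapper that used to launch the forward kernel unconditionally and then re-launch on the backward fields when mp->simulation_type == 3 now takes an explicit FORWARD_OR_ADJOINT flag (1 = forward, 3 = backward/reconstructed wavefield) and launches exactly one kernel on the matching device arrays. A minimal sketch of the convention in plain C, with hypothetical stand-ins for the real d_* device pointers:

#include <stdio.h>

typedef float realw;

typedef struct {
  realw *d_displ, *d_accel;     /* forward wavefields */
  realw *d_b_displ, *d_b_accel; /* backward/reconstructed wavefields */
} Mesh;

/* stand-in for a <<<grid,threads>>> kernel launch */
static void launch(realw *displ, realw *accel) {
  printf("launch with displ=%p accel=%p\n", (void*)displ, (void*)accel);
}

void compute_something_cuda(Mesh *mp, int *FORWARD_OR_ADJOINT) {
  if (*FORWARD_OR_ADJOINT == 1) {
    launch(mp->d_displ, mp->d_accel);       /* forward pass */
  } else if (*FORWARD_OR_ADJOINT == 3) {
    launch(mp->d_b_displ, mp->d_b_accel);   /* backward pass */
  }
}

int main(void) {
  realw f[2] = {0}, b[2] = {0};
  Mesh m = { &f[0], &f[1], &b[0], &b[1] };
  int flag = 1;
  compute_something_cuda(&m, &flag);
  flag = 3;
  compute_something_cuda(&m, &flag);
  return 0;
}
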
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in	2013-08-30 12:45:44 UTC (rev 22746)
@@ -277,12 +277,10 @@
 ! old version
 ! old version 5.1.5 uses full 3d attenuation arrays (set to .true.), custom_real for attenuation arrays (Qmu_store, tau_e_store)
   logical, parameter :: USE_VERSION_5_1_5 = .true.
-!  logical, parameter :: USE_3D_ATTENUATION_ARRAYS = .true.
 !  integer, parameter :: CUSTOM_REAL_ATT = CUSTOM_REAL
 ! new version
 ! new version uses full 3d attenuation, double precision for attenuation arrays (Qmu_store, tau_e_store)
 !!  logical, parameter :: USE_VERSION_5_1_5 = .false.
-!!  logical, parameter :: USE_3D_ATTENUATION_ARRAYS = .true.
 !!  integer,parameter :: CUSTOM_REAL_ATT = SIZE_DOUBLE
 
 

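A side note on the .f90 -> .F90 renames mentioned in the log: most Fortran compilers only run the C preprocessor over uppercase .F90 sources, which is what lets read_mesh_databases.F90 and compute_forces_outer_core_Dev.F90 guard code with #ifdef blocks. The mechanism, illustrated in C since the directives are the same ones cpp applies to .F90 files (ENABLE_BACKWARD is a hypothetical macro, e.g. set with -DENABLE_BACKWARD):

#include <stdio.h>

int main(void) {
#ifdef ENABLE_BACKWARD
  printf("built with backward/reconstructed wavefield support\n");
#else
  printf("forward-only build\n");
#endif
  return 0;
}
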
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2013-08-30 12:45:44 UTC (rev 22746)
@@ -172,7 +172,8 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
-              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f) {
+              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {
 
   TRACE("compute_coupling_fluid_cmb_cuda");
   //double start_time = get_time();
@@ -190,19 +191,19 @@
   dim3 threads(5,5,1);
 
   // launches GPU kernel
-  compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
-                                                      mp->d_accel_outer_core,
-                                                      mp->d_ibool_crust_mantle,
-                                                      mp->d_ibelm_bottom_crust_mantle,
-                                                      mp->d_normal_top_outer_core,
-                                                      mp->d_jacobian2D_top_outer_core,
-                                                      mp->d_wgllwgll_xy,
-                                                      mp->d_ibool_outer_core,
-                                                      mp->d_ibelm_top_outer_core,
-                                                      mp->nspec2D_top_outer_core);
-
-  // adjoint simulations
-  if ( mp->simulation_type == 3 ){
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+                                                        mp->d_accel_outer_core,
+                                                        mp->d_ibool_crust_mantle,
+                                                        mp->d_ibelm_bottom_crust_mantle,
+                                                        mp->d_normal_top_outer_core,
+                                                        mp->d_jacobian2D_top_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_top_outer_core,
+                                                        mp->nspec2D_top_outer_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // adjoint simulations
     compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
                                                         mp->d_b_accel_outer_core,
                                                         mp->d_ibool_crust_mantle,
@@ -226,7 +227,8 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_fluid_icb_cuda,
-              COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f) {
+              COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {
 
   TRACE("compute_coupling_fluid_icb_cuda");
   //double start_time = get_time();
@@ -244,19 +246,19 @@
   dim3 threads(5,5,1);
 
   // launches GPU kernel
-  compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
-                                                      mp->d_accel_outer_core,
-                                                      mp->d_ibool_inner_core,
-                                                      mp->d_ibelm_top_inner_core,
-                                                      mp->d_normal_bottom_outer_core,
-                                                      mp->d_jacobian2D_bottom_outer_core,
-                                                      mp->d_wgllwgll_xy,
-                                                      mp->d_ibool_outer_core,
-                                                      mp->d_ibelm_bottom_outer_core,
-                                                      mp->nspec2D_bottom_outer_core);
-
-  // adjoint simulations
-  if ( mp->simulation_type == 3 ){
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+                                                        mp->d_accel_outer_core,
+                                                        mp->d_ibool_inner_core,
+                                                        mp->d_ibelm_top_inner_core,
+                                                        mp->d_normal_bottom_outer_core,
+                                                        mp->d_jacobian2D_bottom_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_bottom_outer_core,
+                                                        mp->nspec2D_bottom_outer_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // adjoint simulations
     compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
                                                         mp->d_b_accel_outer_core,
                                                         mp->d_ibool_inner_core,
@@ -268,6 +270,7 @@
                                                         mp->d_ibelm_bottom_outer_core,
                                                         mp->nspec2D_bottom_outer_core);
   }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
@@ -420,7 +423,8 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
-              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f) {
+              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {
 
   TRACE("compute_coupling_cmb_fluid_cuda");
   //double start_time = get_time();
@@ -438,23 +442,23 @@
   dim3 threads(5,5,1);
 
   // launches GPU kernel
-  compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
-                                                      mp->d_accel_crust_mantle,
-                                                      mp->d_accel_outer_core,
-                                                      mp->d_ibool_crust_mantle,
-                                                      mp->d_ibelm_bottom_crust_mantle,
-                                                      mp->d_normal_top_outer_core,
-                                                      mp->d_jacobian2D_top_outer_core,
-                                                      mp->d_wgllwgll_xy,
-                                                      mp->d_ibool_outer_core,
-                                                      mp->d_ibelm_top_outer_core,
-                                                      mp->RHO_TOP_OC,
-                                                      mp->minus_g_cmb,
-                                                      mp->gravity,
-                                                      mp->nspec2D_bottom_crust_mantle);
-
-  //  adjoint simulations
-  if ( mp->simulation_type == 3 ){
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+                                                        mp->d_accel_crust_mantle,
+                                                        mp->d_accel_outer_core,
+                                                        mp->d_ibool_crust_mantle,
+                                                        mp->d_ibelm_bottom_crust_mantle,
+                                                        mp->d_normal_top_outer_core,
+                                                        mp->d_jacobian2D_top_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_top_outer_core,
+                                                        mp->RHO_TOP_OC,
+                                                        mp->minus_g_cmb,
+                                                        mp->gravity,
+                                                        mp->nspec2D_bottom_crust_mantle);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    //  adjoint simulations
     compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
                                                         mp->d_b_accel_crust_mantle,
                                                         mp->d_b_accel_outer_core,
@@ -482,7 +486,8 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_icb_fluid_cuda,
-              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f) {
+              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {
 
   TRACE("compute_coupling_icb_fluid_cuda");
   //double start_time = get_time();
@@ -500,23 +505,23 @@
   dim3 threads(5,5,1);
 
   // launches GPU kernel
-  compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
-                                                      mp->d_accel_inner_core,
-                                                      mp->d_accel_outer_core,
-                                                      mp->d_ibool_inner_core,
-                                                      mp->d_ibelm_top_inner_core,
-                                                      mp->d_normal_bottom_outer_core,
-                                                      mp->d_jacobian2D_bottom_outer_core,
-                                                      mp->d_wgllwgll_xy,
-                                                      mp->d_ibool_outer_core,
-                                                      mp->d_ibelm_bottom_outer_core,
-                                                      mp->RHO_BOTTOM_OC,
-                                                      mp->minus_g_icb,
-                                                      mp->gravity,
-                                                      mp->nspec2D_top_inner_core);
-
-  //  adjoint simulations
-  if ( mp->simulation_type == 3 ){
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+                                                        mp->d_accel_inner_core,
+                                                        mp->d_accel_outer_core,
+                                                        mp->d_ibool_inner_core,
+                                                        mp->d_ibelm_top_inner_core,
+                                                        mp->d_normal_bottom_outer_core,
+                                                        mp->d_jacobian2D_bottom_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_bottom_outer_core,
+                                                        mp->RHO_BOTTOM_OC,
+                                                        mp->minus_g_icb,
+                                                        mp->gravity,
+                                                        mp->nspec2D_top_inner_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    //  adjoint simulations
     compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
                                                         mp->d_b_accel_inner_core,
                                                         mp->d_b_accel_outer_core,
@@ -625,6 +630,7 @@
 
   if( ( *NCHUNKS_VAL != 6 && mp->absorbing_conditions || (mp->rotation && *exact_mass_matrix_for_rotation)) &&
       ! *use_lddrk ){
+    // uses corrected mass matrices
     if( *FORWARD_OR_ADJOINT == 1 ){
       compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                            mp->d_rmassx_crust_mantle,
@@ -646,6 +652,7 @@
                                                            mp->d_normal_ocean_load);
     }
   }else{
+    // uses only rmassz
     if( *FORWARD_OR_ADJOINT == 1 ){
       compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                            mp->d_rmassz_crust_mantle,

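Besides the new FORWARD_OR_ADJOINT dispatch, the ocean-coupling wrapper above keeps a second, orthogonal choice: with absorbing conditions on a regional mesh (NCHUNKS != 6), or with an exact mass matrix for rotation (and no LDDRK), the kernel receives the three corrected mass matrices rmassx/rmassy/rmassz; otherwise rmassz is passed for all three components. A small self-contained sketch of that selection, with hypothetical array names:

#include <stdio.h>

typedef float realw;

/* pick corrected per-component mass matrices when they apply,
   otherwise reuse the z-component matrix for all three components */
static void select_mass_matrices(int use_corrected,
                                 realw *rmassx, realw *rmassy, realw *rmassz,
                                 realw **rx, realw **ry, realw **rz) {
  if (use_corrected) {
    *rx = rmassx; *ry = rmassy; *rz = rmassz;
  } else {
    *rx = rmassz; *ry = rmassz; *rz = rmassz;
  }
}

int main(void) {
  realw mx = 1.f, my = 2.f, mz = 3.f;
  realw *rx, *ry, *rz;
  select_mass_matrices(0, &mx, &my, &mz, &rx, &ry, &rz);
  printf("selected: %g %g %g\n", *rx, *ry, *rz);   /* prints: 3 3 3 */
  return 0;
}
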
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2013-08-30 12:45:44 UTC (rev 22746)
@@ -169,9 +169,11 @@
 #endif
 
   __shared__ realw s_dummy_loc[NGLL3];
+
   __shared__ realw s_temp1[NGLL3];
   __shared__ realw s_temp2[NGLL3];
   __shared__ realw s_temp3[NGLL3];
+
   __shared__ realw sh_hprime_xx[NGLL2];
   __shared__ realw sh_hprimewgll_xx[NGLL2];
 
@@ -461,11 +463,12 @@
                          realw* d_xix,realw* d_xiy,realw* d_xiz,
                          realw* d_etax,realw* d_etay,realw* d_etaz,
                          realw* d_gammax,realw* d_gammay,realw* d_gammaz,
-                         realw time, realw b_time,
+                         realw time,
                          realw* d_A_array_rotation,
                          realw* d_B_array_rotation,
                          realw* d_b_A_array_rotation,
-                         realw* d_b_B_array_rotation){
+                         realw* d_b_B_array_rotation,
+                         int FORWARD_OR_ADJOINT){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before outer_core kernel Kernel_2");
@@ -497,7 +500,8 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+  if( FORWARD_OR_ADJOINT == 1 ){
+    Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
                                                           mp->NGLOB_OUTER_CORE,
                                                           d_ibool,
                                                           mp->d_phase_ispec_inner_outer_core,
@@ -524,8 +528,7 @@
                                                           d_A_array_rotation,
                                                           d_B_array_rotation,
                                                           mp->NSPEC_OUTER_CORE);
-
-  if(mp->simulation_type == 3) {
+  }else if( FORWARD_OR_ADJOINT == 3 ){
     Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
                                                             mp->NGLOB_OUTER_CORE,
                                                             d_ibool,
@@ -547,7 +550,7 @@
                                                             mp->d_minus_rho_g_over_kappa_fluid,
                                                             mp->d_wgll_cube,
                                                             mp->rotation,
-                                                            b_time,
+                                                            time,
                                                             mp->b_two_omega_earth,
                                                             mp->b_deltat,
                                                             d_b_A_array_rotation,
@@ -580,7 +583,7 @@
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
                                               realw* time_f,
-                                              realw* b_time_f) {
+                                              int* FORWARD_OR_ADJOINT) {
 
   TRACE("compute_forces_outer_core_cuda");
 
@@ -590,7 +593,6 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
   realw time = *time_f;
-  realw b_time = *b_time_f;
 
   int num_elements;
 
@@ -676,11 +678,12 @@
                           mp->d_gammax_outer_core + color_offset,
                           mp->d_gammay_outer_core + color_offset,
                           mp->d_gammaz_outer_core + color_offset,
-                          time,b_time,
+                          time,
                           mp->d_A_array_rotation + color_offset_nonpadded,
                           mp->d_B_array_rotation + color_offset_nonpadded,
                           mp->d_b_A_array_rotation + color_offset_nonpadded,
-                          mp->d_b_B_array_rotation + color_offset_nonpadded);
+                          mp->d_b_B_array_rotation + color_offset_nonpadded,
+                          *FORWARD_OR_ADJOINT);
 
       // for padded and aligned arrays
       color_offset += nb_blocks_to_compute * NGLL3_PADDED;
@@ -697,11 +700,12 @@
                         mp->d_xix_outer_core,mp->d_xiy_outer_core,mp->d_xiz_outer_core,
                         mp->d_etax_outer_core,mp->d_etay_outer_core,mp->d_etaz_outer_core,
                         mp->d_gammax_outer_core,mp->d_gammay_outer_core,mp->d_gammaz_outer_core,
-                        time,b_time,
+                        time,
                         mp->d_A_array_rotation,
                         mp->d_B_array_rotation,
                         mp->d_b_A_array_rotation,
-                        mp->d_b_B_array_rotation);
+                        mp->d_b_B_array_rotation,
+                        *FORWARD_OR_ADJOINT);
 
   }
 

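The color_offset bookkeeping in the hunks above follows the usual mesh-coloring scheme: elements are grouped into colors such that elements of one color share no global points, each color is launched as one batch, and the array pointers advance by the batch size, using the padded stride NGLL3_PADDED for aligned arrays and plain NGLL3 for the rotation arrays. A serial sketch of the loop structure, with hypothetical color sizes:

#include <stdio.h>

#define NGLL3 125          /* 5x5x5 GLL points per element */
#define NGLL3_PADDED 128   /* padded stride for aligned arrays */

static void process_batch(int nb, int padded_off, int nonpadded_off) {
  printf("batch of %d elements at offsets %d / %d\n",
         nb, padded_off, nonpadded_off);
}

int main(void) {
  int num_elem_colors[] = {40, 32, 28};   /* hypothetical color sizes */
  int num_colors = 3;
  int color_offset = 0;
  int color_offset_nonpadded = 0;

  for (int icolor = 0; icolor < num_colors; icolor++) {
    int nb_blocks_to_compute = num_elem_colors[icolor];
    process_batch(nb_blocks_to_compute, color_offset, color_offset_nonpadded);
    /* for padded and aligned arrays */
    color_offset += nb_blocks_to_compute * NGLL3_PADDED;
    /* for non-padded arrays (e.g. the rotation arrays) */
    color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
  }
  return 0;
}
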
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2013-08-30 12:45:44 UTC (rev 22746)
@@ -51,9 +51,7 @@
                                                realw* wgllwgll,
                                                int* ibool,
                                                realw* vpstore,
-                                               int SIMULATION_TYPE,
                                                int SAVE_FORWARD,
-                                               realw* b_potential_dot_dot_acoustic,
                                                realw* b_absorb_potential) {
 
   int igll = threadIdx.x;
@@ -157,10 +155,7 @@
     atomicAdd(&potential_dot_dot_acoustic[iglob],-sn*jacobianw);
 
     // adjoint simulations
-    if( SIMULATION_TYPE == 3 ){
-      // Sommerfeld condition
-      atomicAdd(&b_potential_dot_dot_acoustic[iglob],-b_absorb_potential[INDEX2(NGLL2,igll,iface)]);
-    }else if( SIMULATION_TYPE == 1 && SAVE_FORWARD ){
+    if( SAVE_FORWARD ){
       // saves boundary values
       b_absorb_potential[INDEX2(NGLL2,igll,iface)] = sn*jacobianw;
     }
@@ -259,14 +254,6 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //  adjoint simulations: needs absorbing boundary buffer
-  if (mp->simulation_type == 3 && num_abs_boundary_faces > 0 ){
-    // copies array to GPU
-    print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_potential,absorb_potential,
-                            NGLL2*num_abs_boundary_faces*sizeof(realw),
-                            cudaMemcpyHostToDevice),7700);
-  }
-
   compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_veloc_outer_core,
                                                    mp->d_accel_outer_core,
                                                    interface_type,
@@ -282,13 +269,11 @@
                                                    d_wgllwgll,
                                                    mp->d_ibool_outer_core,
                                                    mp->d_vp_outer_core,
-                                                   mp->simulation_type,
                                                    mp->save_forward,
-                                                   mp->d_b_accel_outer_core,
                                                    d_b_absorb_potential);
 
   //  adjoint simulations: stores absorbed wavefield part
-  if (mp->simulation_type == 1 && mp->save_forward && num_abs_boundary_faces > 0 ){
+  if( mp->save_forward && num_abs_boundary_faces > 0 ){
     // copies array to CPU
     print_CUDA_error_if_any(cudaMemcpy(absorb_potential,d_b_absorb_potential,
                                        NGLL2*num_abs_boundary_faces*sizeof(realw),
@@ -300,3 +285,206 @@
 #endif
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_stacey_acoustic_backward_kernel(realw* b_potential_dot_dot_acoustic,
+                                                        realw* b_absorb_potential,
+                                                        int interface_type,
+                                                        int num_abs_boundary_faces,
+                                                        int* abs_boundary_ispec,
+                                                        int* nkmin_xi, int* nkmin_eta,
+                                                        int* njmin, int* njmax,
+                                                        int* nimin, int* nimax,
+                                                        int* ibool) {
+
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+  int i,j,k,iglob,ispec;
+
+  // don't compute points outside NGLLSQUARE==NGLL2==25
+  // way 2: no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
+
+  //  if(igll<NGLL2 && iface < num_abs_boundary_faces) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = abs_boundary_ispec[iface]-1;
+
+    // determines indices i,j,k depending on absorbing boundary type
+    switch( interface_type ){
+      case 4:
+        // xmin
+        if( nkmin_xi[INDEX2(2,0,iface)] == 0 || njmin[INDEX2(2,0,iface)] == 0 ) return;
+
+        i = 0; // index -1
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+
+        if( k < nkmin_xi[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,0,iface)]-1 || j > njmax[INDEX2(2,0,iface)]-1 ) return;
+
+        break;
+
+      case 5:
+        // xmax
+        if( nkmin_xi[INDEX2(2,1,iface)] == 0 || njmin[INDEX2(2,1,iface)] == 0 ) return;
+
+        i = NGLLX-1;
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+
+        if( k < nkmin_xi[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,1,iface)]-1 || j > njmax[INDEX2(2,1,iface)]-1 ) return;
+
+        break;
+
+      case 6:
+        // ymin
+        if( nkmin_eta[INDEX2(2,0,iface)] == 0 || nimin[INDEX2(2,0,iface)] == 0 ) return;
+
+        j = 0;
+        k = (igll/NGLLX);
+        i = (igll-k*NGLLX);
+
+        if( k < nkmin_eta[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,0,iface)]-1 || i > nimax[INDEX2(2,0,iface)]-1 ) return;
+
+        break;
+
+      case 7:
+        // ymax
+        if( nkmin_eta[INDEX2(2,1,iface)] == 0 || nimin[INDEX2(2,1,iface)] == 0 ) return;
+
+        j = NGLLX-1;
+        k = (igll/NGLLX);
+        i = (igll-k*NGLLX);
+
+        if( k < nkmin_eta[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,1,iface)]-1 || i > nimax[INDEX2(2,1,iface)]-1 ) return;
+
+        break;
+
+      case 8:
+        // zmin
+        k = 0;
+        j = (igll/NGLLX);
+        i = (igll-j*NGLLX);
+
+        if( j < 0 || j > NGLLX-1 ) return;
+        if( i < 0 || i > NGLLX-1 ) return;
+
+        break;
+
+    }
+
+    iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+    // Sommerfeld condition
+    atomicAdd(&b_potential_dot_dot_acoustic[iglob],-b_absorb_potential[INDEX2(NGLL2,igll,iface)]);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_stacey_acoustic_backward_cuda,
+              COMPUTE_STACEY_ACOUSTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
+                                                     realw* absorb_potential,
+                                                     int* itype) {
+TRACE("compute_stacey_acoustic_backward_cuda");
+  //double start_time = get_time();
+
+  int num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  realw* d_b_absorb_potential;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // absorbing boundary type
+  int interface_type = *itype;
+  switch( interface_type ){
+    case 4:
+      // xmin
+      num_abs_boundary_faces = mp->nspec2D_xmin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_xmin_outer_core;
+      d_b_absorb_potential = mp->d_absorb_xmin_outer_core;
+      break;
+
+    case 5:
+      // xmax
+      num_abs_boundary_faces = mp->nspec2D_xmax_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_xmax_outer_core;
+      d_b_absorb_potential = mp->d_absorb_xmax_outer_core;
+      break;
+
+    case 6:
+      // ymin
+      num_abs_boundary_faces = mp->nspec2D_ymin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_ymin_outer_core;
+      d_b_absorb_potential = mp->d_absorb_ymin_outer_core;
+      break;
+
+    case 7:
+      // ymax
+      num_abs_boundary_faces = mp->nspec2D_ymax_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_ymax_outer_core;
+      d_b_absorb_potential = mp->d_absorb_ymax_outer_core;
+      break;
+
+    case 8:
+      // zmin
+      num_abs_boundary_faces = mp->nspec2D_zmin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_bottom_outer_core;
+      d_b_absorb_potential = mp->d_absorb_zmin_outer_core;
+      break;
+
+    default:
+      exit_on_cuda_error("compute_stacey_acoustic_backward_cuda: unknown interface type");
+      break;
+  }
+
+  // checks if anything to do
+  if( num_abs_boundary_faces == 0 ) return;
+
+  // way 1: Elapsed time: 4.385948e-03
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //  int blocksize = 32;
+
+  // way 2: Elapsed time: 4.379034e-03
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  int num_blocks_x = num_abs_boundary_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  //  adjoint simulations: needs absorbing boundary buffer
+  if( num_abs_boundary_faces > 0 ){
+    // copies array to GPU
+    print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_potential,absorb_potential,
+                                       NGLL2*num_abs_boundary_faces*sizeof(realw),
+                                       cudaMemcpyHostToDevice),7700);
+  }
+
+  compute_stacey_acoustic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_outer_core,
+                                                            d_b_absorb_potential,
+                                                            interface_type,
+                                                            num_abs_boundary_faces,
+                                                            d_abs_boundary_ispec,
+                                                            mp->d_nkmin_xi_outer_core,mp->d_nkmin_eta_outer_core,
+                                                            mp->d_njmin_outer_core,mp->d_njmax_outer_core,
+                                                            mp->d_nimin_outer_core,mp->d_nimax_outer_core,
+                                                            mp->d_ibool_outer_core);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_acoustic_backward_kernel");
+#endif
+}
+

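The split performed in this file is the general save/re-inject pattern for absorbing (Stacey) boundaries in adjoint runs: the forward kernel absorbs the outgoing wave and, when SAVE_FORWARD is set, stores the absorbed contribution sn*jacobianw per boundary point; the new backward kernel later subtracts exactly the stored values from the reconstructed field (the Sommerfeld condition). A serial toy sketch of the two phases, with hypothetical array sizes:

#include <stdio.h>

#define NPOINTS 4   /* hypothetical number of boundary points */

/* forward phase: absorb, and optionally save the absorbed term */
static void stacey_forward(float *accel, const float *contribution,
                           int save_forward, float *b_absorb) {
  for (int i = 0; i < NPOINTS; i++) {
    accel[i] -= contribution[i];
    if (save_forward)
      b_absorb[i] = contribution[i];   /* copied back to the host afterwards */
  }
}

/* backward phase: re-inject the stored terms on the reconstructed field */
static void stacey_backward(float *b_accel, const float *b_absorb) {
  for (int i = 0; i < NPOINTS; i++)
    b_accel[i] -= b_absorb[i];
}

int main(void) {
  float accel[NPOINTS] = {0}, b_accel[NPOINTS] = {0};
  float contrib[NPOINTS] = {1, 2, 3, 4}, b_absorb[NPOINTS];
  stacey_forward(accel, contrib, 1, b_absorb);
  stacey_backward(b_accel, b_absorb);
  printf("b_accel[0] = %g\n", b_accel[0]);   /* prints -1 */
  return 0;
}
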
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2013-08-30 12:45:44 UTC (rev 22746)
@@ -53,9 +53,7 @@
                                               int* ibool,
                                               realw* rho_vp,
                                               realw* rho_vs,
-                                              int SIMULATION_TYPE,
                                               int SAVE_FORWARD,
-                                              realw* b_accel,
                                               realw* b_absorb_field) {
 
   int igll = threadIdx.x; // tx
@@ -166,12 +164,7 @@
     atomicAdd(&accel[iglob*3+1],-ty*jacobianw);
     atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
 
-    if(SIMULATION_TYPE == 3) {
-      atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
-      atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
-      atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
-    }
-    else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
+    if( SAVE_FORWARD ){
       b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
       b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
       b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
@@ -186,8 +179,8 @@
 extern "C"
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
-                                                realw* absorb_field,
-                                                int* itype) {
+                                           realw* absorb_field,
+                                           int* itype) {
 
 TRACE("compute_stacey_elastic_cuda");
 
@@ -268,13 +261,6 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  // adjoint simulations: needs absorbing boundary buffer
-  if(mp->simulation_type == 3 && num_abs_boundary_faces > 0) {
-    // copies array to GPU
-    print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
-                            NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
-  }
-
   // absorbing boundary contributions
   compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc_crust_mantle,
                                                   mp->d_accel_crust_mantle,
@@ -293,14 +279,12 @@
                                                   mp->d_ibool_crust_mantle,
                                                   mp->d_rho_vp_crust_mantle,
                                                   mp->d_rho_vs_crust_mantle,
-                                                  mp->simulation_type,
                                                   mp->save_forward,
-                                                  mp->d_b_accel_crust_mantle,
                                                   d_b_absorb_field);
 
 
   // adjoint simulations: stores absorbed wavefield part
-  if(mp->simulation_type == 1 && mp->save_forward && num_abs_boundary_faces > 0 ) {
+  if(mp->save_forward && num_abs_boundary_faces > 0 ) {
     // copies array to CPU
     print_CUDA_error_if_any(cudaMemcpy(absorb_field,d_b_absorb_field,
                             NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyDeviceToHost),7701);
@@ -311,3 +295,197 @@
 #endif
 }
 
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// backward/reconstructed wavefields
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_stacey_elastic_backward_kernel(realw* b_accel,
+                                                       realw* b_absorb_field,
+                                                       int interface_type,
+                                                       int num_abs_boundary_faces,
+                                                       int* abs_boundary_ispec,
+                                                       int* nkmin_xi, int* nkmin_eta,
+                                                       int* njmin, int* njmax,
+                                                       int* nimin, int* nimax,
+                                                       int* ibool) {
+
+  int igll = threadIdx.x; // tx
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
+
+  int i,j,k,iglob,ispec;
+
+  // don't compute surface faces outside of range
+  // and don't compute points outside NGLLSQUARE==NGLL2==25
+  //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
+  // way 2: only check face, no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = abs_boundary_ispec[iface]-1;
+
+    // determines indices i,j,k depending on absorbing boundary type
+    switch( interface_type ){
+      case 0:
+        // xmin
+        if( nkmin_xi[INDEX2(2,0,iface)] == 0 || njmin[INDEX2(2,0,iface)] == 0 ) return;
+
+        i = 0; // index -1
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+
+        if( k < nkmin_xi[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,0,iface)]-1 || j > NGLLX-1 ) return;
+
+        break;
+
+      case 1:
+        // xmax
+        if( nkmin_xi[INDEX2(2,1,iface)] == 0 || njmin[INDEX2(2,1,iface)] == 0 ) return;
+
+        i = NGLLX-1;
+        k = (igll/NGLLX);
+        j = (igll-k*NGLLX);
+
+        if( k < nkmin_xi[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( j < njmin[INDEX2(2,1,iface)]-1 || j > njmax[INDEX2(2,1,iface)]-1 ) return;
+
+        break;
+
+      case 2:
+        // ymin
+        if( nkmin_eta[INDEX2(2,0,iface)] == 0 || nimin[INDEX2(2,0,iface)] == 0 ) return;
+
+        j = 0;
+        k = (igll/NGLLX);
+        i = (igll-k*NGLLX);
+
+        if( k < nkmin_eta[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,0,iface)]-1 || i > nimax[INDEX2(2,0,iface)]-1 ) return;
+
+        break;
+
+      case 3:
+        // ymax
+        if( nkmin_eta[INDEX2(2,1,iface)] == 0 || nimin[INDEX2(2,1,iface)] == 0 ) return;
+
+        j = NGLLX-1;
+        k = (igll/NGLLX);
+        i = (igll-k*NGLLX);
+
+        if( k < nkmin_eta[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
+        if( i < nimin[INDEX2(2,1,iface)]-1 || i > nimax[INDEX2(2,1,iface)]-1 ) return;
+
+        break;
+    }
+
+    iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+
+    atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+    atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+    atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
+
+  } // num_abs_boundary_faces
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(compute_stacey_elastic_backward_cuda,
+              COMPUTE_STACEY_ELASTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
+                                                    realw* absorb_field,
+                                                    int* itype) {
+
+TRACE("compute_stacey_elastic_backward_cuda");
+
+  int num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  realw* d_b_absorb_field;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // absorbing boundary type
+  int interface_type = *itype;
+  switch( interface_type ){
+    case 0:
+      // xmin
+      num_abs_boundary_faces = mp->nspec2D_xmin_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_xmin_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_xmin_crust_mantle;
+      break;
+
+    case 1:
+      // xmax
+      num_abs_boundary_faces = mp->nspec2D_xmax_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_xmax_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_xmax_crust_mantle;
+      break;
+
+    case 2:
+      // ymin
+      num_abs_boundary_faces = mp->nspec2D_ymin_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_ymin_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_ymin_crust_mantle;
+      break;
+
+    case 3:
+      // ymax
+      num_abs_boundary_faces = mp->nspec2D_ymax_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_ymax_crust_mantle;
+      d_b_absorb_field = mp->d_absorb_ymax_crust_mantle;
+      break;
+
+    default:
+      exit_on_cuda_error("compute_stacey_elastic_backward_cuda: unknown interface type");
+      break;
+  }
+
+  // checks if anything to do
+  if( num_abs_boundary_faces == 0 ) return;
+
+  // way 1
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //int blocksize = 32;
+
+  // way 2: seems slightly faster
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  int num_blocks_x = num_abs_boundary_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // adjoint simulations: needs absorbing boundary buffer
+  if( num_abs_boundary_faces > 0 ){
+    // copies array to GPU
+    print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
+                                       NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
+  }
+
+  // absorbing boundary contributions
+  compute_stacey_elastic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+                                                           d_b_absorb_field,
+                                                           interface_type,
+                                                           num_abs_boundary_faces,
+                                                           d_abs_boundary_ispec,
+                                                           mp->d_nkmin_xi_crust_mantle,mp->d_nkmin_eta_crust_mantle,
+                                                           mp->d_njmin_crust_mantle,mp->d_njmax_crust_mantle,
+                                                           mp->d_nimin_crust_mantle,mp->d_nimax_crust_mantle,
+                                                           mp->d_ibool_crust_mantle);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_elastic_backward_cuda");
+#endif
+}
+

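Both new backward wrappers reuse the same launch-geometry trick as the forward ones: one block per boundary face, and when the face count exceeds the per-dimension grid limit, the x extent is halved (rounded up) and the y extent doubled until it fits; the kernel then reconstructs iface = blockIdx.x + gridDim.x*blockIdx.y and guards iface < num_abs_boundary_faces against the overshoot. A standalone sketch of the splitting loop (the MAXIMUM_GRID_DIM value is an assumption):

#include <math.h>
#include <stdio.h>

#define MAXIMUM_GRID_DIM 65535   /* assumed per-dimension grid limit */

int main(void) {
  int num_abs_boundary_faces = 200000;   /* hypothetical workload */
  int num_blocks_x = num_abs_boundary_faces;
  int num_blocks_y = 1;

  while (num_blocks_x > MAXIMUM_GRID_DIM) {
    num_blocks_x = (int) ceil(num_blocks_x * 0.5f);
    num_blocks_y = num_blocks_y * 2;
  }

  /* the resulting grid covers at least num_abs_boundary_faces blocks */
  printf("grid = (%d,%d) -> %d blocks for %d faces\n",
         num_blocks_x, num_blocks_y,
         num_blocks_x * num_blocks_y, num_abs_boundary_faces);
  return 0;
}
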
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2013-08-30 12:45:44 UTC (rev 22746)
@@ -72,9 +72,7 @@
                                              realw* deltat_F,
                                              realw* deltatsqover2_F,
                                              realw* deltatover2_F,
-                                             realw* b_deltat_F,
-                                             realw* b_deltatsqover2_F,
-                                             realw* b_deltatover2_F) {
+                                             int* FORWARD_OR_ADJOINT) {
 
 TRACE("it_update_displacement_ic_cuda");
 
@@ -99,22 +97,18 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //launch kernel
-  UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
-                                           mp->d_veloc_inner_core,
-                                           mp->d_accel_inner_core,
-                                           size,deltat,deltatsqover2,deltatover2);
-
-  // kernel for backward fields
-  if(mp->simulation_type == 3) {
-    realw b_deltat = *b_deltat_F;
-    realw b_deltatsqover2 = *b_deltatsqover2_F;
-    realw b_deltatover2 = *b_deltatover2_F;
-
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    //launch kernel
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+                                             mp->d_veloc_inner_core,
+                                             mp->d_accel_inner_core,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // kernel for backward fields
     UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
                                              mp->d_b_veloc_inner_core,
                                              mp->d_b_accel_inner_core,
-                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+                                             size,deltat,deltatsqover2,deltatover2);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -135,9 +129,7 @@
                                              realw* deltat_F,
                                              realw* deltatsqover2_F,
                                              realw* deltatover2_F,
-                                             realw* b_deltat_F,
-                                             realw* b_deltatsqover2_F,
-                                             realw* b_deltatover2_F) {
+                                             int* FORWARD_OR_ADJOINT) {
 
   TRACE("it_update_displacement_cm_cuda");
 
@@ -162,22 +154,18 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //launch kernel
-  UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
-                                           mp->d_veloc_crust_mantle,
-                                           mp->d_accel_crust_mantle,
-                                           size,deltat,deltatsqover2,deltatover2);
-
-  // kernel for backward fields
-  if(mp->simulation_type == 3) {
-    realw b_deltat = *b_deltat_F;
-    realw b_deltatsqover2 = *b_deltatsqover2_F;
-    realw b_deltatover2 = *b_deltatover2_F;
-
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    //launch kernel
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+                                             mp->d_veloc_crust_mantle,
+                                             mp->d_accel_crust_mantle,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // kernel for backward fields
     UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
                                              mp->d_b_veloc_crust_mantle,
                                              mp->d_b_accel_crust_mantle,
-                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+                                             size,deltat,deltatsqover2,deltatover2);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -226,12 +214,10 @@
 extern "C"
 void FC_FUNC_(it_update_displacement_oc_cuda,
               IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
-                                               realw* deltat_F,
-                                               realw* deltatsqover2_F,
-                                               realw* deltatover2_F,
-                                               realw* b_deltat_F,
-                                               realw* b_deltatsqover2_F,
-                                               realw* b_deltatover2_F) {
+                                              realw* deltat_F,
+                                              realw* deltatsqover2_F,
+                                              realw* deltatover2_F,
+                                              int* FORWARD_OR_ADJOINT) {
 
   TRACE("it_update_displacement_oc_cuda");
 
@@ -256,21 +242,17 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //launch kernel
-  UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
-                                           mp->d_veloc_outer_core,
-                                           mp->d_accel_outer_core,
-                                           size,deltat,deltatsqover2,deltatover2);
-
-  if(mp->simulation_type == 3) {
-    realw b_deltat = *b_deltat_F;
-    realw b_deltatsqover2 = *b_deltatsqover2_F;
-    realw b_deltatover2 = *b_deltatover2_F;
-
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    //launch kernel
+    UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
+                                             mp->d_veloc_outer_core,
+                                             mp->d_accel_outer_core,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
     UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
                                              mp->d_b_veloc_outer_core,
                                              mp->d_b_accel_outer_core,
-                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+                                             size,deltat,deltatsqover2,deltatover2);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -352,20 +334,16 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(kernel_3_a_cuda,
-              KERNEL_3_A_CUDA)(long* Mesh_pointer,
-                               realw* deltatover2_F,
-                               int* SIMULATION_TYPE_f,
-                               realw* b_deltatover2_F,
-                               int* NCHUNKS_VAL) {
-  TRACE("kernel_3_a_cuda");
+void FC_FUNC_(update_accel_3_a_cuda,
+              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* NCHUNKS_VAL,
+                                     int* FORWARD_OR_ADJOINT) {
+  TRACE("update_accel_3_a_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
-
   realw deltatover2 = *deltatover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
 
   int blocksize = BLOCKSIZE_KERNEL3;
   int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
@@ -386,40 +364,42 @@
     // updates both, accel and veloc
 
     if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
-      kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
-                                               mp->d_accel_crust_mantle,
-                                               mp->NGLOB_CRUST_MANTLE,
-                                               deltatover2,
-                                               mp->two_omega_earth,
-                                               mp->d_rmassx_crust_mantle,
-                                               mp->d_rmassy_crust_mantle,
-                                               mp->d_rmassz_crust_mantle);
-
-      if(SIMULATION_TYPE == 3){
+      // uses corrected mass matrices rmassx,rmassy,rmassz
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+                                                 mp->d_accel_crust_mantle,
+                                                 mp->NGLOB_CRUST_MANTLE,
+                                                 deltatover2,
+                                                 mp->two_omega_earth,
+                                                 mp->d_rmassx_crust_mantle,
+                                                 mp->d_rmassy_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
         kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
                                                  mp->d_b_accel_crust_mantle,
                                                  mp->NGLOB_CRUST_MANTLE,
-                                                 b_deltatover2,
+                                                 deltatover2,
                                                  mp->b_two_omega_earth,
                                                  mp->d_rmassx_crust_mantle,
                                                  mp->d_rmassy_crust_mantle,
                                                  mp->d_rmassz_crust_mantle);
       }
     }else{
-      kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
-                                               mp->d_accel_crust_mantle,
-                                               mp->NGLOB_CRUST_MANTLE,
-                                               deltatover2,
-                                               mp->two_omega_earth,
-                                               mp->d_rmassz_crust_mantle,
-                                               mp->d_rmassz_crust_mantle,
-                                               mp->d_rmassz_crust_mantle);
-
-      if(SIMULATION_TYPE == 3){
+      // uses only rmassz
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+                                                 mp->d_accel_crust_mantle,
+                                                 mp->NGLOB_CRUST_MANTLE,
+                                                 deltatover2,
+                                                 mp->two_omega_earth,
+                                                 mp->d_rmassz_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
         kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
                                                  mp->d_b_accel_crust_mantle,
                                                  mp->NGLOB_CRUST_MANTLE,
-                                                 b_deltatover2,
+                                                 deltatover2,
                                                  mp->b_two_omega_earth,
                                                  mp->d_rmassz_crust_mantle,
                                                  mp->d_rmassz_crust_mantle,
@@ -431,15 +411,16 @@
     // updates only accel
 
     if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
-      kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
-                                                     mp->d_veloc_crust_mantle,
-                                                     mp->NGLOB_CRUST_MANTLE,
-                                                     mp->two_omega_earth,
-                                                     mp->d_rmassx_crust_mantle,
-                                                     mp->d_rmassy_crust_mantle,
-                                                     mp->d_rmassz_crust_mantle);
-
-      if(SIMULATION_TYPE == 3) {
+      // uses corrected mass matrices
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                       mp->d_veloc_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       mp->two_omega_earth,
+                                                       mp->d_rmassx_crust_mantle,
+                                                       mp->d_rmassy_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
         kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
                                                        mp->d_b_veloc_crust_mantle,
                                                        mp->NGLOB_CRUST_MANTLE,
@@ -449,15 +430,16 @@
                                                        mp->d_rmassz_crust_mantle);
       }
     }else{
-      kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
-                                                     mp->d_veloc_crust_mantle,
-                                                     mp->NGLOB_CRUST_MANTLE,
-                                                     mp->two_omega_earth,
-                                                     mp->d_rmassz_crust_mantle,
-                                                     mp->d_rmassz_crust_mantle,
-                                                     mp->d_rmassz_crust_mantle);
-
-      if(SIMULATION_TYPE == 3) {
+      // uses only rmassz
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                       mp->d_veloc_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       mp->two_omega_earth,
+                                                       mp->d_rmassz_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
         kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
                                                        mp->d_b_veloc_crust_mantle,
                                                        mp->NGLOB_CRUST_MANTLE,
@@ -471,26 +453,23 @@
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after kernel_3_a");
+  exit_on_cuda_error("after update_accel_3_a");
 #endif
 }
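
[editor's note] The device kernels launched by these wrappers (kernel_3_cuda_device, kernel_3_accel_cuda_device) are defined elsewhere in the CUDA sources and are not part of this hunk. As a rough guide only, here is a minimal sketch of the accel-only update, assuming interleaved x/y/z storage and a single-precision working type; the kernel name and the typedef are illustrative, not the project's:

typedef float realw;  // assumption: the solver's working precision

// sketch: scale acceleration by the (direction-dependent) inverse mass
// matrices and add the Coriolis-type rotation coupling in the horizontal plane
__global__ void kernel_3_accel_sketch(realw* accel, realw* veloc, int size,
                                      realw two_omega_earth,
                                      realw* rmassx, realw* rmassy, realw* rmassz) {
  int id = threadIdx.x + (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x;
  if (id < size) {
    accel[3*id]     = accel[3*id]     * rmassx[id] + two_omega_earth * veloc[3*id + 1];
    accel[3*id + 1] = accel[3*id + 1] * rmassy[id] - two_omega_earth * veloc[3*id];
    accel[3*id + 2] = accel[3*id + 2] * rmassz[id];
  }
}

Note that in the non-absorbing/6-chunk branch the wrapper passes d_rmassz_crust_mantle for all three mass-matrix arguments, so the same kernel body serves both the corrected (rmassx/rmassy/rmassz) and the rmassz-only cases.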
 
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(kernel_3_b_cuda,
-              KERNEL_3_B_CUDA)(long* Mesh_pointer,
-                               realw* deltatover2_F,
-                               int* SIMULATION_TYPE_f,
-                               realw* b_deltatover2_F) {
-  TRACE("kernel_3_b_cuda");
+void FC_FUNC_(update_veloc_3_b_cuda,
+              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* FORWARD_OR_ADJOINT) {
+  TRACE("update_veloc_3_b_cuda");
   int size_padded,num_blocks_x,num_blocks_y;
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
   realw deltatover2 = *deltatover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
 
   int blocksize = BLOCKSIZE_KERNEL3;
 
@@ -508,16 +487,16 @@
     dim3 threads1(blocksize,1,1);
 
     // updates only veloc at this point
-    kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
-                                                     mp->d_accel_crust_mantle,
-                                                     mp->NGLOB_CRUST_MANTLE,
-                                                     deltatover2);
-
-    if(SIMULATION_TYPE == 3) {
+    if( *FORWARD_OR_ADJOINT == 1 ){
+      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
+                                                       mp->d_accel_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       deltatover2);
+    }else if( *FORWARD_OR_ADJOINT == 3 ){
       kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
                                                        mp->d_b_accel_crust_mantle,
                                                        mp->NGLOB_CRUST_MANTLE,
-                                                       b_deltatover2);
+                                                       deltatover2);
     }
   }
 
@@ -533,30 +512,29 @@
   dim3 threads(blocksize,1,1);
 
   // updates both, accel and veloc
-  kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
-                                           mp->d_accel_inner_core,
-                                           mp->NGLOB_INNER_CORE,
-                                           deltatover2,
-                                           mp->two_omega_earth,
-                                           mp->d_rmass_inner_core,
-                                           mp->d_rmass_inner_core,
-                                           mp->d_rmass_inner_core);
-
-  if(SIMULATION_TYPE == 3) {
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+                                             mp->d_accel_inner_core,
+                                             mp->NGLOB_INNER_CORE,
+                                             deltatover2,
+                                             mp->two_omega_earth,
+                                             mp->d_rmass_inner_core,
+                                             mp->d_rmass_inner_core,
+                                             mp->d_rmass_inner_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
     kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
                                              mp->d_b_accel_inner_core,
                                              mp->NGLOB_INNER_CORE,
-                                             b_deltatover2,
+                                             deltatover2,
                                              mp->b_two_omega_earth,
                                              mp->d_rmass_inner_core,
                                              mp->d_rmass_inner_core,
                                              mp->d_rmass_inner_core);
   }
 
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after kernel_3_b");
+  exit_on_cuda_error("after update_veloc_3_b");
 #endif
 }
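
[editor's note] update_veloc_3_b_cuda now takes a single deltatover2 plus a FORWARD_OR_ADJOINT selector instead of separate forward and backward time steps. The velocity corrector it launches is the second half of the Newmark update; a minimal sketch under the same assumptions as above (illustrative names, interleaved storage):

typedef float realw;  // assumption: single-precision working type

// sketch: Newmark corrector step, veloc += (dt/2) * accel per component
__global__ void kernel_3_veloc_sketch(realw* veloc, realw* accel,
                                      int size, realw deltatover2) {
  int id = threadIdx.x + (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x;
  if (id < size) {
    veloc[3*id]     += deltatover2 * accel[3*id];
    veloc[3*id + 1] += deltatover2 * accel[3*id + 1];
    veloc[3*id + 2] += deltatover2 * accel[3*id + 2];
  }
}

Passing the same deltatover2 to the backward (FORWARD_OR_ADJOINT == 3) launch is what makes the b_deltatover2 argument removable throughout this file.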
 
@@ -593,16 +571,13 @@
 void FC_FUNC_(kernel_3_outer_core_cuda,
               KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
                                         realw* deltatover2_F,
-                                        int* SIMULATION_TYPE_f,
-                                        realw* b_deltatover2_F) {
+                                        int* FORWARD_OR_ADJOINT) {
 
   TRACE("kernel_3_outer_core_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
   realw deltatover2 = *deltatover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
 
   int blocksize = BLOCKSIZE_KERNEL3;
 
@@ -616,16 +591,16 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
-                                                      mp->d_accel_outer_core,
-                                                      mp->NGLOB_OUTER_CORE,
-                                                      deltatover2,mp->d_rmass_outer_core);
-
-  if(SIMULATION_TYPE == 3) {
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+                                                        mp->d_accel_outer_core,
+                                                        mp->NGLOB_OUTER_CORE,
+                                                        deltatover2,mp->d_rmass_outer_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
     kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
                                                         mp->d_b_accel_outer_core,
                                                         mp->NGLOB_OUTER_CORE,
-                                                        b_deltatover2,mp->d_rmass_outer_core);
+                                                        deltatover2,mp->d_rmass_outer_core);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

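[editor's note] Each of these wrappers sizes its launch through size_padded, num_blocks_x and num_blocks_y before building grid/threads. The usual reason for the 2D grid is CUDA's per-dimension block-count limit; a sketch of that folding, which may differ in detail from the project's actual helper:

#include <math.h>

// sketch: fold a 1D block count into a 2D grid to stay under the 65535
// blocks-per-dimension limit of older compute capabilities
static void get_blocks_xy_sketch(int num_blocks, int* num_blocks_x, int* num_blocks_y) {
  *num_blocks_x = num_blocks;
  *num_blocks_y = 1;
  while (*num_blocks_x > 65535) {
    *num_blocks_x = (int)ceil(*num_blocks_x * 0.5);
    *num_blocks_y = *num_blocks_y * 2;
  }
}

// usage, e.g.: size_padded = ((nglob + blocksize - 1) / blocksize) * blocksize;
//              get_blocks_xy_sketch(size_padded / blocksize, &nbx, &nby);
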
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-08-30 12:45:44 UTC (rev 22746)
@@ -155,16 +155,20 @@
 //
 
 void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
-              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f) {} 
+              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_fluid_icb_cuda,
-              COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f) {} 
+              COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
-              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f) {} 
+              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_icb_fluid_cuda,
-              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f) {} 
+              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
@@ -200,7 +204,7 @@
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
                                               realw* time_f,
-                                              realw* b_time_f) {} 
+                                              int* FORWARD_OR_ADJOINT) {} 
 
 
 //
@@ -235,17 +239,27 @@
                                             realw* absorb_potential,
                                             int* itype) {} 
 
+void FC_FUNC_(compute_stacey_acoustic_backward_cuda,
+              COMPUTE_STACEY_ACOUSTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
+                                                     realw* absorb_potential,
+                                                     int* itype) {} 
 
+
 //
 // src/cuda/compute_stacey_elastic_cuda.cu
 //
 
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
-                                                realw* absorb_field,
-                                                int* itype) {} 
+                                           realw* absorb_field,
+                                           int* itype) {} 
 
+void FC_FUNC_(compute_stacey_elastic_backward_cuda,
+              COMPUTE_STACEY_ELASTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
+                                                    realw* absorb_field,
+                                                    int* itype) {} 
 
+
 //
 // src/cuda/initialize_cuda.cu
 //
@@ -266,46 +280,37 @@
                                              realw* deltat_F,
                                              realw* deltatsqover2_F,
                                              realw* deltatover2_F,
-                                             realw* b_deltat_F,
-                                             realw* b_deltatsqover2_F,
-                                             realw* b_deltatover2_F) {} 
+                                             int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(it_update_displacement_cm_cuda,
               IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
                                              realw* deltat_F,
                                              realw* deltatsqover2_F,
                                              realw* deltatover2_F,
-                                             realw* b_deltat_F,
-                                             realw* b_deltatsqover2_F,
-                                             realw* b_deltatover2_F) {} 
+                                             int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(it_update_displacement_oc_cuda,
               IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
-                                               realw* deltat_F,
-                                               realw* deltatsqover2_F,
-                                               realw* deltatover2_F,
-                                               realw* b_deltat_F,
-                                               realw* b_deltatsqover2_F,
-                                               realw* b_deltatover2_F) {} 
+                                              realw* deltat_F,
+                                              realw* deltatsqover2_F,
+                                              realw* deltatover2_F,
+                                              int* FORWARD_OR_ADJOINT) {} 
 
-void FC_FUNC_(kernel_3_a_cuda,
-              KERNEL_3_A_CUDA)(long* Mesh_pointer,
-                               realw* deltatover2_F,
-                               int* SIMULATION_TYPE_f,
-                               realw* b_deltatover2_F,
-                               int* NCHUNKS_VAL) {} 
+void FC_FUNC_(update_accel_3_a_cuda,
+              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* NCHUNKS_VAL,
+                                     int* FORWARD_OR_ADJOINT) {} 
 
-void FC_FUNC_(kernel_3_b_cuda,
-              KERNEL_3_B_CUDA)(long* Mesh_pointer,
-                               realw* deltatover2_F,
-                               int* SIMULATION_TYPE_f,
-                               realw* b_deltatover2_F) {} 
+void FC_FUNC_(update_veloc_3_b_cuda,
+              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(kernel_3_outer_core_cuda,
               KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
                                         realw* deltatover2_F,
-                                        int* SIMULATION_TYPE_f,
-                                        realw* b_deltatover2_F) {} 
+                                        int* FORWARD_OR_ADJOINT) {} 
 
 
 //

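[editor's note] These stubs keep the Fortran side linkable when the solver is configured without CUDA: every wrapper exported by the .cu files must exist here with the same FC_FUNC_-mangled name and the same argument list, which is why each interface change above is mirrored in this file. A sketch of the glue, assuming a gfortran-style macro (the real FC_FUNC_ is generated at configure time):

/* assumption: lowercase-plus-underscore mangling, as gfortran produces */
#define FC_FUNC_(name,NAME) name##_

typedef float realw;

/* the Fortran call update_veloc_3_b_cuda(...) then resolves to the
   C symbol update_veloc_3_b_cuda_, an empty no-op in a CPU-only build */
void FC_FUNC_(update_veloc_3_b_cuda,
              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
                                     realw* deltatover2_F,
                                     int* FORWARD_OR_ADJOINT) {}
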
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/parallel.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/parallel.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/parallel.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -591,6 +591,28 @@
 !
 
 
+  subroutine recv_singlel(recvbuf, dest, recvtag)
+
+  use mpi
+
+  implicit none
+
+  integer :: dest,recvtag
+  logical :: recvbuf
+
+  ! MPI status of messages to be received
+  integer :: msg_status(MPI_STATUS_SIZE)
+  integer :: ier
+
+  call MPI_RECV(recvbuf,1,MPI_LOGICAL,dest,recvtag,MPI_COMM_WORLD,msg_status,ier)
+
+  end subroutine recv_singlel
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+
   subroutine recv_i(recvbuf, recvcount, dest, recvtag)
 
   use mpi
@@ -672,6 +694,7 @@
   integer :: dest,sendtag
   integer :: sendcount
   integer,dimension(sendcount):: sendbuf
+
   integer :: ier
 
   call MPI_SEND(sendbuf,sendcount,MPI_INTEGER,dest,sendtag,MPI_COMM_WORLD,ier)
@@ -690,13 +713,32 @@
 
   integer :: dest,sendtag
   integer :: sendbuf
+
   integer :: ier
 
   call MPI_SEND(sendbuf,1,MPI_INTEGER,dest,sendtag,MPI_COMM_WORLD,ier)
 
   end subroutine send_singlei
 
+!
+!-------------------------------------------------------------------------------------------------
+!
 
+  subroutine send_singlel(sendbuf, dest, sendtag)
+
+  use mpi
+
+  implicit none
+
+  integer :: dest,sendtag
+  logical :: sendbuf
+
+  integer :: ier
+
+  call MPI_SEND(sendbuf,1,MPI_LOGICAL,dest,sendtag,MPI_COMM_WORLD,ier)
+
+  end subroutine send_singlel
+
 !
 !-------------------------------------------------------------------------------------------------
 !
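
[editor's note] send_singlel/recv_singlel complete the family of single-value wrappers with a LOGICAL variant (as in the neighbouring wrappers, the receive routine reuses the name dest for the source rank). For comparison, a C-side sketch of the same blocking calls; MPI_C_BOOL plays the role of Fortran's MPI_LOGICAL here, though the two types are not interoperable in general:

#include <mpi.h>
#include <stdbool.h>

/* sketch only, not project code: blocking single-logical send/receive */
void send_singlel_c(bool sendbuf, int dest, int sendtag) {
  MPI_Send(&sendbuf, 1, MPI_C_BOOL, dest, sendtag, MPI_COMM_WORLD);
}

void recv_singlel_c(bool* recvbuf, int source, int recvtag) {
  MPI_Recv(recvbuf, 1, MPI_C_BOOL, source, recvtag,
           MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}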

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -114,6 +114,8 @@
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: MODEL'
   call read_value_double_precision(RECORD_LENGTH_IN_MINUTES, 'solver.RECORD_LENGTH_IN_MINUTES', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: RECORD_LENGTH_IN_MINUTES'
+
+  ! attenuation parameters
   call read_value_logical(ATTENUATION_1D_WITH_3D_STORAGE, 'solver.ATTENUATION_1D_WITH_3D_STORAGE', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: ATTENUATION_1D_WITH_3D_STORAGE'
   call read_value_logical(PARTIAL_PHYS_DISPERSION_ONLY, 'solver.PARTIAL_PHYS_DISPERSION_ONLY', ierr)
@@ -123,12 +125,10 @@
   call read_value_integer(NT_DUMP_ATTENUATION, 'solver.NT_DUMP_ATTENUATION', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: NT_DUMP_ATTENUATION'
 
+  ! mass matrix corrections
   call read_value_logical(EXACT_MASS_MATRIX_FOR_ROTATION, 'solver.EXACT_MASS_MATRIX_FOR_ROTATION', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: EXACT_MASS_MATRIX_FOR_ROTATION'
 
-  ! ignore EXACT_MASS_MATRIX_FOR_ROTATION if rotation is not included in the simulations
-  if(.not. ROTATION) EXACT_MASS_MATRIX_FOR_ROTATION = .false.
-
   ! low-memory runge-kutta time scheme
   call read_value_logical(USE_LDDRK, 'solver.USE_LDDRK', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: USE_LDDRK'
@@ -175,11 +175,13 @@
   call read_value_integer(NUMBER_OF_THIS_RUN, 'solver.NUMBER_OF_THIS_RUN', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: NUMBER_OF_THIS_RUN'
 
+  ! data file output directories
   call read_value_string(LOCAL_PATH, 'LOCAL_PATH', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: LOCAL_PATH'
   call read_value_string(LOCAL_TMP_PATH, 'LOCAL_TMP_PATH', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: LOCAL_TMP_PATH'
 
+  ! user output
   call read_value_integer(NTSTEP_BETWEEN_OUTPUT_INFO, 'solver.NTSTEP_BETWEEN_OUTPUT_INFO', ierr)
   if (ierr /= 0) stop 'an error occurred while reading the parameter file: NTSTEP_BETWEEN_OUTPUT_INFO'
   call read_value_integer(NTSTEP_BETWEEN_OUTPUT_SEISMOS, 'solver.NTSTEP_BETWEEN_OUTPUT_SEISMOS', ierr)
@@ -247,6 +249,29 @@
     stop 'an error occurred while reading the parameter file'
   endif
 
+  ! ignore EXACT_MASS_MATRIX_FOR_ROTATION if rotation is not included in the simulations
+  if(.not. ROTATION) EXACT_MASS_MATRIX_FOR_ROTATION = .false.
+
+  ! produces simulations compatible with old globe version 5.1.5
+  if( USE_VERSION_5_1_5 ) then
+    if( .not. ATTENUATION_1D_WITH_3D_STORAGE ) then
+      print*,'setting ATTENUATION_1D_WITH_3D_STORAGE to .true. for compatibility with globe version 5.1.5 '
+      ATTENUATION_1D_WITH_3D_STORAGE = .true.
+    endif
+    if( UNDO_ATTENUATION ) then
+      print*,'setting UNDO_ATTENUATION to .false. for compatibility with globe version 5.1.5 '
+      UNDO_ATTENUATION = .false.
+    endif
+    if( USE_LDDRK ) then
+      print*,'setting USE_LDDRK to .false. for compatibility with globe version 5.1.5 '
+      USE_LDDRK = .false.
+    endif
+    if( EXACT_MASS_MATRIX_FOR_ROTATION ) then
+      print*,'setting EXACT_MASS_MATRIX_FOR_ROTATION to .false. for compatibility with globe version 5.1.5 '
+      EXACT_MASS_MATRIX_FOR_ROTATION = .false.
+    endif
+  endif
+
 !daniel debug: status of implementation
 
 !! DK DK July 2013: temporary, the time for Matthieu Lefebvre to merge his ADIOS implementation

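[editor's note] The ROTATION-dependent override moves below the read section because ROTATION itself is only known once all parameters have been read; testing it between reads would use a stale value. The new USE_VERSION_5_1_5 block follows the same rule. A sketch of the pattern, with hypothetical names:

#include <stdbool.h>

struct solver_flags {
  bool rotation, exact_mass_matrix_for_rotation;
  bool use_version_5_1_5, attenuation_1d_with_3d_storage;
  bool undo_attenuation, use_lddrk;
};

/* sketch: consistency fix-ups run only after every flag has been parsed */
static void apply_consistency_overrides(struct solver_flags* f) {
  /* the exact mass matrix for rotation is meaningless without rotation */
  if (!f->rotation)
    f->exact_mass_matrix_for_rotation = false;
  /* compatibility with globe version 5.1.5 disables the newer features */
  if (f->use_version_5_1_5) {
    f->attenuation_1d_with_3d_storage = true;
    f->undo_attenuation = false;
    f->use_lddrk = false;
    f->exact_mass_matrix_for_rotation = false;
  }
}
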
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -60,7 +60,12 @@
           f0 = hdur(isource) !! using hdur as a FREQUENCY just to avoid changing CMTSOLUTION file format
 
          ! This is the expression of a Ricker; it should perhaps be made configurable through the Par_file.
-          stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+          if(USE_LDDRK)then
+            stf_used = FACTOR_FORCE_SOURCE * &
+                     comp_source_time_function_rickr(dble(it-1)*DT + dble(C_LDDRK(istage))*DT-t0-tshift_cmt(isource),f0)
+          else
+            stf_used = FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+          endif
 
           ! we use a force in a single direction along one of the components:
           !  x/y/z or E/N/Z-direction would correspond to 1/2/3 = COMPONENT_FORCE_SOURCE
@@ -69,7 +74,12 @@
                            + sngl( nu_source(COMPONENT_FORCE_SOURCE,:,isource) ) * stf_used
 
         else
-          stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+          if(USE_LDDRK)then
+            stf = comp_source_time_function(dble(it-1)*DT + &
+                                            dble(C_LDDRK(istage))*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+          else
+            stf = comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+          endif
 
           !     distinguish between single and double precision for reals
           if(CUSTOM_REAL == SIZE_REAL) then
@@ -103,13 +113,23 @@
     ! prepares buffer with source time function values, to be copied onto GPU
     if(USE_FORCE_POINT_SOURCE) then
       do isource = 1,NSOURCES
-        stf_pre_compute(isource) = &
-          FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+        if(USE_LDDRK)then
+          stf_pre_compute(isource) = FACTOR_FORCE_SOURCE * &
+                     comp_source_time_function_rickr(dble(it-1)*DT + dble(C_LDDRK(istage))*DT-t0-tshift_cmt(isource),f0)
+        else
+          stf_pre_compute(isource) = &
+                      FACTOR_FORCE_SOURCE * comp_source_time_function_rickr(dble(it-1)*DT-t0-tshift_cmt(isource),f0)
+        endif
       enddo
     else
       do isource = 1,NSOURCES
-        stf_pre_compute(isource) = &
-          comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+        if(USE_LDDRK)then
+          stf_pre_compute(isource) = comp_source_time_function(dble(it-1)*DT + &
+                                            dble(C_LDDRK(istage))*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+        else
+          stf_pre_compute(isource) = &
+            comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
+        endif
       enddo
     endif
     ! adds sources: only implements SIMTYPE=1 and NOISE_TOM=0

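[editor's note] With the LDDRK scheme each Runge-Kutta stage samples the source time function at the intermediate time (it-1)*DT + C_LDDRK(istage)*DT rather than at (it-1)*DT, which is the offset these hunks thread through every source-time evaluation. A sketch of the convention (the c_lddrk array is a placeholder for the solver's C_LDDRK coefficients):

#include <stdbool.h>

/* sketch: time argument passed to the source time function at step it,
   stage istage; returned relative to origin time t0 and source shift */
double source_time_sketch(int it, int istage, double dt, double t0,
                          double tshift, bool use_lddrk, const double* c_lddrk) {
  double t = (double)(it - 1) * dt;
  if (use_lddrk)
    t += c_lddrk[istage] * dt;   /* intermediate-stage offset */
  return t - t0 - tshift;
}
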
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_boundary_kernel.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_boundary_kernel.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_boundary_kernel.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -138,7 +138,7 @@
 
   ! Moho
   if (.not. SUPPRESS_CRUSTAL_MESH .and. HONOR_1D_SPHERICAL_MOHO) then
-    call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+    call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
              b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
@@ -154,7 +154,7 @@
              c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
              k_top,ibelm_moho_top,normal_moho,moho_kl_top,fluid_solid_boundary,NSPEC2D_MOHO)
 
-    call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+    call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
              b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
@@ -174,7 +174,7 @@
   endif
 
   ! 400
-  call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
              b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
@@ -190,7 +190,7 @@
              c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
              k_top,ibelm_400_top,normal_400,d400_kl_top,fluid_solid_boundary,NSPEC2D_400)
 
-  call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
              b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
@@ -209,7 +209,7 @@
   d400_kl = d400_kl + (d400_kl_top - d400_kl_bot) * deltat
 
   ! 670
-  call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
              b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
@@ -225,7 +225,7 @@
              c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
              k_top,ibelm_670_top,normal_670,d670_kl_top,fluid_solid_boundary,NSPEC2D_670)
 
-  call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
              b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
@@ -247,7 +247,7 @@
   fluid_solid_boundary = .true.
   iregion_code = IREGION_CRUST_MANTLE
 
-  call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+  call compute_boundary_kernel_depth(displ_crust_mantle,accel_crust_mantle, &
              b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
              ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
@@ -270,7 +270,7 @@
   allocate(dummy_ispec_is_tiso(NSPEC_OUTER_CORE))
   dummy_ispec_is_tiso(:) = .false.
 
-  call compute_boundary_kernel(vector_displ_outer_core,vector_accel_outer_core, &
+  call compute_boundary_kernel_depth(vector_displ_outer_core,vector_accel_outer_core, &
              b_vector_displ_outer_core,nspec_outer_core, &
              iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
              xix_outer_core,xiy_outer_core,xiz_outer_core, &
@@ -291,7 +291,7 @@
 
   ! ICB
   fluid_solid_boundary = .true.
-  call compute_boundary_kernel(vector_displ_outer_core,vector_accel_outer_core, &
+  call compute_boundary_kernel_depth(vector_displ_outer_core,vector_accel_outer_core, &
              b_vector_displ_outer_core,nspec_outer_core, &
              iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
              xix_outer_core,xiy_outer_core,xiz_outer_core, &
@@ -317,7 +317,7 @@
   allocate(dummy_ispec_is_tiso(NSPEC_INNER_CORE))
   dummy_ispec_is_tiso(:) = .false.
 
-  call compute_boundary_kernel(displ_inner_core,accel_inner_core, &
+  call compute_boundary_kernel_depth(displ_inner_core,accel_inner_core, &
              b_displ_inner_core,nspec_inner_core,iregion_code, &
              ystore_inner_core,zstore_inner_core,ibool_inner_core,dummy_ispec_is_tiso, &
              xix_inner_core,xiy_inner_core,xiz_inner_core, &
@@ -343,15 +343,15 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_boundary_kernel(displ,accel,b_displ,nspec,iregion_code, &
-           ystore,zstore,ibool,ispec_is_tiso, &
-           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-           hprime_xx,hprime_yy,hprime_zz, &
-           rhostore,kappavstore,muvstore,kappahstore,muhstore,eta_anisostore, &
-           c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-           c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-           c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
-           k_disc,ibelm_disc,normal_disc,b_kl,fluid_solid_boundary,NSPEC2D_DISC)
+  subroutine compute_boundary_kernel_depth(displ,accel,b_displ,nspec,iregion_code, &
+                                           ystore,zstore,ibool,ispec_is_tiso, &
+                                           xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                           hprime_xx,hprime_yy,hprime_zz, &
+                                           rhostore,kappavstore,muvstore,kappahstore,muhstore,eta_anisostore, &
+                                           c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
+                                           c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
+                                           c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
+                                           k_disc,ibelm_disc,normal_disc,b_kl,fluid_solid_boundary,NSPEC2D_DISC)
 
   use constants
 
@@ -543,7 +543,7 @@
 
     enddo
 
-  end subroutine compute_boundary_kernel
+  end subroutine compute_boundary_kernel_depth
 
 
 ! ==========================================================================================
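
[editor's note] The worker routine is renamed compute_boundary_kernel_depth, presumably so it no longer shares a name with the higher-level entry point that calls it once per discontinuity. The accumulation those calls feed is visible in the hunks above, e.g. d400_kl = d400_kl + (d400_kl_top - d400_kl_bot) * deltat; rendered as a C sketch with hypothetical names:

/* sketch: each discontinuity kernel integrates the difference between its
   top-side and bottom-side contributions over the time loop */
void accumulate_boundary_kernel(float* kl, const float* kl_top,
                                const float* kl_bot, int npoints, float deltat) {
  int i;
  for (i = 0; i < npoints; i++)
    kl[i] += (kl_top[i] - kl_bot[i]) * deltat;
}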

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -25,12 +25,12 @@
 !
 !=====================================================================
 
-  subroutine compute_coupling_fluid_CMB(displ_crust_mantle,b_displ_crust_mantle, &
+  subroutine compute_coupling_fluid_CMB(displ_crust_mantle, &
                                        ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-                                       accel_outer_core,b_accel_outer_core, &
+                                       accel_outer_core, &
                                        normal_top_outer_core,jacobian2D_top_outer_core, &
                                        wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-                                       SIMULATION_TYPE,nspec2D_top)
+                                       nspec2D_top)
 
   use constants_solver
 
@@ -38,14 +38,11 @@
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
     displ_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
-    b_displ_crust_mantle
 
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
   integer, dimension(NSPEC2D_BOTTOM_CM) :: ibelm_bottom_crust_mantle
 
   real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: b_accel_outer_core
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_OC) :: normal_top_outer_core
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_OC) :: jacobian2D_top_outer_core
@@ -54,8 +51,7 @@
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
   integer, dimension(NSPEC2D_TOP_OC) :: ibelm_top_outer_core
 
-  integer SIMULATION_TYPE
-  integer nspec2D_top
+  integer :: nspec2D_top
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: displ_x,displ_y,displ_z,displ_n,nx,ny,nz,weight
@@ -98,21 +94,6 @@
 
         ! update fluid acceleration/pressure
         accel_outer_core(iglob_oc) = accel_outer_core(iglob_oc) + weight*displ_n
-
-        if (SIMULATION_TYPE == 3) then
-          ! get displacement in crust mantle
-          iglob_cm = ibool_crust_mantle(i,j,k_corresp,ispec_selected)
-          displ_x = b_displ_crust_mantle(1,iglob_cm)
-          displ_y = b_displ_crust_mantle(2,iglob_cm)
-          displ_z = b_displ_crust_mantle(3,iglob_cm)
-
-          displ_n = displ_x*nx + displ_y*ny + displ_z*nz
-
-          ! update fluid acceleration/pressure
-          iglob_oc = ibool_outer_core(i,j,k,ispec)
-          b_accel_outer_core(iglob_oc) = b_accel_outer_core(iglob_oc) + weight*displ_n
-        endif
-
       enddo
     enddo
   enddo
@@ -123,12 +104,12 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_coupling_fluid_ICB(displ_inner_core,b_displ_inner_core, &
-                            ibool_inner_core,ibelm_top_inner_core,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
-                            SIMULATION_TYPE,nspec_bottom)
+  subroutine compute_coupling_fluid_ICB(displ_inner_core, &
+                                        ibool_inner_core,ibelm_top_inner_core,  &
+                                        accel_outer_core, &
+                                        normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                        wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                                        nspec_bottom)
 
   use constants_solver
 
@@ -136,14 +117,11 @@
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: &
     displ_inner_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE_ADJOINT) :: &
-    b_displ_inner_core
 
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
   integer, dimension(NSPEC2D_TOP_IC) :: ibelm_top_inner_core
 
   real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: b_accel_outer_core
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: normal_bottom_outer_core
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: jacobian2D_bottom_outer_core
@@ -152,8 +130,7 @@
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
   integer, dimension(NSPEC2D_BOTTOM_OC) :: ibelm_bottom_outer_core
 
-  integer SIMULATION_TYPE
-  integer nspec_bottom
+  integer :: nspec_bottom
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: displ_x,displ_y,displ_z,displ_n,nx,ny,nz,weight
@@ -196,22 +173,6 @@
 
         ! update fluid acceleration/pressure
         accel_outer_core(iglob_oc) = accel_outer_core(iglob_oc) - weight*displ_n
-
-        if (SIMULATION_TYPE == 3) then
-          ! get displacement in inner core
-          iglob_ic = ibool_inner_core(i,j,k_corresp,ispec_selected)
-          displ_x = b_displ_inner_core(1,iglob_ic)
-          displ_y = b_displ_inner_core(2,iglob_ic)
-          displ_z = b_displ_inner_core(3,iglob_ic)
-
-          displ_n = displ_x*nx + displ_y*ny + displ_z*nz
-
-          ! update fluid acceleration/pressure
-          iglob_oc = ibool_outer_core(i,j,k,ispec)
-          b_accel_outer_core(iglob_oc) = b_accel_outer_core(iglob_oc) - weight*displ_n
-
-        endif
-
       enddo
     enddo
   enddo
@@ -222,14 +183,14 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_coupling_CMB_fluid(displ_crust_mantle,b_displ_crust_mantle, &
-                            accel_crust_mantle,b_accel_crust_mantle, &
-                            ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_top_outer_core,jacobian2D_top_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-                            RHO_TOP_OC,minus_g_cmb, &
-                            SIMULATION_TYPE,nspec_bottom)
+  subroutine compute_coupling_CMB_fluid(displ_crust_mantle, &
+                                        accel_crust_mantle, &
+                                        ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                                        accel_outer_core, &
+                                        normal_top_outer_core,jacobian2D_top_outer_core, &
+                                        wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                                        RHO_TOP_OC,minus_g_cmb, &
+                                        nspec_bottom)
 
   use constants_solver
 
@@ -237,14 +198,11 @@
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
     displ_crust_mantle,accel_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
-    b_displ_crust_mantle,b_accel_crust_mantle
 
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
   integer, dimension(NSPEC2D_BOTTOM_CM) :: ibelm_bottom_crust_mantle
 
   real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: b_accel_outer_core
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_OC) :: normal_top_outer_core
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_OC) :: jacobian2D_top_outer_core
@@ -253,11 +211,10 @@
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
   integer, dimension(NSPEC2D_TOP_OC) :: ibelm_top_outer_core
 
-  double precision RHO_TOP_OC
-  real(kind=CUSTOM_REAL) minus_g_cmb
+  double precision :: RHO_TOP_OC
+  real(kind=CUSTOM_REAL) :: minus_g_cmb
 
-  integer SIMULATION_TYPE
-  integer nspec_bottom
+  integer :: nspec_bottom
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: pressure,nx,ny,nz,weight
@@ -302,20 +259,6 @@
         accel_crust_mantle(1,iglob_mantle) = accel_crust_mantle(1,iglob_mantle) + weight*nx*pressure
         accel_crust_mantle(2,iglob_mantle) = accel_crust_mantle(2,iglob_mantle) + weight*ny*pressure
         accel_crust_mantle(3,iglob_mantle) = accel_crust_mantle(3,iglob_mantle) + weight*nz*pressure
-
-        if (SIMULATION_TYPE == 3) then
-          if(GRAVITY_VAL) then
-            pressure = RHO_TOP_OC * (- b_accel_outer_core(iglob_oc) &
-               + minus_g_cmb *(b_displ_crust_mantle(1,iglob_mantle)*nx &
-               + b_displ_crust_mantle(2,iglob_mantle)*ny + b_displ_crust_mantle(3,iglob_mantle)*nz))
-          else
-            pressure = - RHO_TOP_OC * b_accel_outer_core(iglob_oc)
-          endif
-          b_accel_crust_mantle(1,iglob_mantle) = b_accel_crust_mantle(1,iglob_mantle) + weight*nx*pressure
-          b_accel_crust_mantle(2,iglob_mantle) = b_accel_crust_mantle(2,iglob_mantle) + weight*ny*pressure
-          b_accel_crust_mantle(3,iglob_mantle) = b_accel_crust_mantle(3,iglob_mantle) + weight*nz*pressure
-        endif
-
       enddo
     enddo
   enddo
@@ -327,14 +270,13 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_coupling_ICB_fluid(displ_inner_core,b_displ_inner_core, &
-                            accel_inner_core,b_accel_inner_core, &
-                            ibool_inner_core,ibelm_top_inner_core,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
-                            RHO_BOTTOM_OC,minus_g_icb, &
-                            SIMULATION_TYPE,nspec2D_top)
+  subroutine compute_coupling_ICB_fluid(displ_inner_core,accel_inner_core, &
+                                        ibool_inner_core,ibelm_top_inner_core,  &
+                                        accel_outer_core, &
+                                        normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                        wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                                        RHO_BOTTOM_OC,minus_g_icb, &
+                                        nspec2D_top)
 
   use constants_solver
 
@@ -342,14 +284,11 @@
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: &
     displ_inner_core,accel_inner_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE_ADJOINT) :: &
-    b_displ_inner_core,b_accel_inner_core
 
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
   integer, dimension(NSPEC2D_TOP_IC) :: ibelm_top_inner_core
 
   real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: b_accel_outer_core
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: normal_bottom_outer_core
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: jacobian2D_bottom_outer_core
@@ -358,11 +297,10 @@
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
   integer, dimension(NSPEC2D_BOTTOM_OC) :: ibelm_bottom_outer_core
 
-  double precision RHO_BOTTOM_OC
-  real(kind=CUSTOM_REAL) minus_g_icb
+  double precision :: RHO_BOTTOM_OC
+  real(kind=CUSTOM_REAL) :: minus_g_icb
 
-  integer SIMULATION_TYPE
-  integer nspec2D_top
+  integer :: nspec2D_top
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: pressure,nx,ny,nz,weight
@@ -407,19 +345,6 @@
         accel_inner_core(2,iglob_inner_core) = accel_inner_core(2,iglob_inner_core) - weight*ny*pressure
         accel_inner_core(3,iglob_inner_core) = accel_inner_core(3,iglob_inner_core) - weight*nz*pressure
 
-        if (SIMULATION_TYPE == 3) then
-          if(GRAVITY_VAL) then
-            pressure = RHO_BOTTOM_OC * (- b_accel_outer_core(iglob) &
-               + minus_g_icb *(b_displ_inner_core(1,iglob_inner_core)*nx &
-               + b_displ_inner_core(2,iglob_inner_core)*ny + b_displ_inner_core(3,iglob_inner_core)*nz))
-          else
-            pressure = - RHO_BOTTOM_OC * b_accel_outer_core(iglob)
-          endif
-          b_accel_inner_core(1,iglob_inner_core) = b_accel_inner_core(1,iglob_inner_core) - weight*nx*pressure
-          b_accel_inner_core(2,iglob_inner_core) = b_accel_inner_core(2,iglob_inner_core) - weight*ny*pressure
-          b_accel_inner_core(3,iglob_inner_core) = b_accel_inner_core(3,iglob_inner_core) - weight*nz*pressure
-        endif
-
       enddo
     enddo
   enddo
@@ -435,10 +360,10 @@
                                     rmass_ocean_load,normal_top_crust_mantle, &
                                     ibool_crust_mantle,ibelm_top_crust_mantle, &
                                     updated_dof_ocean_load,NGLOB_XY, &
-                                    nspec_top, &
-                                    ABSORBING_CONDITIONS,EXACT_MASS_MATRIX_FOR_ROTATION,USE_LDDRK)
+                                    nspec_top)
 
   use constants_solver
+  use specfem_par,only: ABSORBING_CONDITIONS,EXACT_MASS_MATRIX_FOR_ROTATION,USE_LDDRK
 
   implicit none
 
@@ -464,9 +389,8 @@
   integer, dimension(NSPEC2D_TOP_CM) :: ibelm_top_crust_mantle
 
   logical, dimension(NGLOB_CRUST_MANTLE_OCEANS) :: updated_dof_ocean_load
-  logical :: ABSORBING_CONDITIONS,EXACT_MASS_MATRIX_FOR_ROTATION,USE_LDDRK
 
-  integer nspec_top
+  integer :: nspec_top
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: force_normal_comp

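[editor's note] With the SIMULATION_TYPE == 3 branches removed, each coupling routine now touches exactly one wavefield; the backward field gets its own calls from the dedicated backward calling routines (see the next file). The per-point operation the fluid-side routines keep is the projection of the solid displacement onto the interface normal; a sketch with illustrative names:

/* sketch of the fluid-side coupling at one GLL point on the CMB:
   add weight * (displ . n) to the fluid potential's second time
   derivative (the sign flips at the ICB, as in the hunks above) */
void couple_fluid_point_sketch(float* accel_oc, int iglob_oc,
                               const float* displ_solid, int iglob_solid,
                               float nx, float ny, float nz, float weight) {
  float displ_n = displ_solid[3*iglob_solid]     * nx
                + displ_solid[3*iglob_solid + 1] * ny
                + displ_solid[3*iglob_solid + 2] * nz;
  accel_oc[iglob_oc] += weight * displ_n;
}
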
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -27,40 +27,33 @@
 
   subroutine compute_forces_acoustic()
 
+! acoustic domains for forward or adjoint simulations (SIMULATION_TYPE == 1 or 2)
+
   use specfem_par
-  use specfem_par_crustmantle,only: displ_crust_mantle,b_displ_crust_mantle, &
+  use specfem_par_crustmantle,only: displ_crust_mantle, &
                                     ibool_crust_mantle,ibelm_bottom_crust_mantle
-  use specfem_par_innercore,only: displ_inner_core,b_displ_inner_core, &
+  use specfem_par_innercore,only: displ_inner_core, &
                                   ibool_inner_core,ibelm_top_inner_core
   use specfem_par_outercore
   implicit none
 
   ! local parameters
-  real(kind=CUSTOM_REAL) :: time,b_time
+  real(kind=CUSTOM_REAL) :: time
   ! non blocking MPI
   ! iphase: iphase = 1 is for computing outer elements in the outer_core,
   !              iphase = 2 is for computing inner elements in the outer core (former icall parameter)
   integer :: iphase
   logical :: phase_is_inner
 
+  ! checks
+  if( SIMULATION_TYPE == 3 ) return
+
   ! compute internal forces in the fluid region
   if(CUSTOM_REAL == SIZE_REAL) then
     time = sngl((dble(it-1)*DT-t0)*scale_t_inv)
   else
     time = (dble(it-1)*DT-t0)*scale_t_inv
   endif
-  if (SIMULATION_TYPE == 3) then
-    ! note on backward/reconstructed wavefields:
-    !       b_time for b_displ( it=1 ) corresponds to (NSTEP - 1)*DT - t0  (after Newmark scheme...)
-    !       as we start with saved wavefields b_displ( 1 ) <-> displ( NSTEP ) which correspond
-    !       to a time (NSTEP - (it-1) - 1)*DT - t0
-    !       for reconstructing the rotational contributions
-    if(CUSTOM_REAL == SIZE_REAL) then
-      b_time = sngl((dble(NSTEP-it)*DT-t0)*scale_t_inv)
-    else
-      b_time = (dble(NSTEP-it)*DT-t0)*scale_t_inv
-    endif
-  endif
 
   ! ****************************************************
   !   big loop over all spectral elements in the fluid
@@ -97,29 +90,10 @@
                                       displ_outer_core,accel_outer_core, &
                                       div_displ_outer_core,phase_is_inner)
       endif
-
-      ! adjoint / kernel runs
-      if (SIMULATION_TYPE == 3) then
-        if( USE_DEVILLE_PRODUCTS_VAL ) then
-          ! uses Deville et al. (2002) routine
-          call compute_forces_outer_core_Dev(b_time,b_deltat,b_two_omega_earth, &
-                                      NSPEC_OUTER_CORE_ROT_ADJOINT,NGLOB_OUTER_CORE_ADJOINT, &
-                                      b_A_array_rotation,b_B_array_rotation, &
-                                      b_displ_outer_core,b_accel_outer_core, &
-                                      b_div_displ_outer_core,phase_is_inner)
-        else
-          call compute_forces_outer_core(b_time,b_deltat,b_two_omega_earth, &
-                                      NSPEC_OUTER_CORE_ROT_ADJOINT,NGLOB_OUTER_CORE_ADJOINT, &
-                                      b_A_array_rotation,b_B_array_rotation, &
-                                      b_displ_outer_core,b_accel_outer_core, &
-                                      b_div_displ_outer_core,phase_is_inner)
-        endif
-      endif
-
     else
       ! on GPU
-      ! includes both forward and adjoint/kernel simulations
-      call compute_forces_outer_core_cuda(Mesh_pointer,iphase,time,b_time)
+      ! includes FORWARD_OR_ADJOINT == 1
+      call compute_forces_outer_core_cuda(Mesh_pointer,iphase,time,1)
     endif
 
 
@@ -127,7 +101,7 @@
     if( iphase == 1 ) then
 
        ! Stacey absorbing boundaries
-       if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_outer_core()
+       if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_outer_core_forward()
 
        ! ****************************************************
        ! **********  add matching with solid part  **********
@@ -139,23 +113,23 @@
           !--- couple with mantle at the top of the outer core
           !---
           if(ACTUALLY_COUPLE_FLUID_CMB) &
-               call compute_coupling_fluid_CMB(displ_crust_mantle,b_displ_crust_mantle, &
-               ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-               accel_outer_core,b_accel_outer_core, &
-               normal_top_outer_core,jacobian2D_top_outer_core, &
-               wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-               SIMULATION_TYPE,NSPEC2D_TOP(IREGION_OUTER_CORE))
+               call compute_coupling_fluid_CMB(displ_crust_mantle, &
+                                               ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                                               accel_outer_core, &
+                                               normal_top_outer_core,jacobian2D_top_outer_core, &
+                                               wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                                               NSPEC2D_TOP(IREGION_OUTER_CORE))
 
           !---
           !--- couple with inner core at the bottom of the outer core
           !---
           if(ACTUALLY_COUPLE_FLUID_ICB) &
-               call compute_coupling_fluid_ICB(displ_inner_core,b_displ_inner_core, &
-               ibool_inner_core,ibelm_top_inner_core,  &
-               accel_outer_core,b_accel_outer_core, &
-               normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
-               wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
-               SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_OUTER_CORE))
+               call compute_coupling_fluid_ICB(displ_inner_core, &
+                                               ibool_inner_core,ibelm_top_inner_core,  &
+                                               accel_outer_core, &
+                                               normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                               wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                                               NSPEC2D_BOTTOM(IREGION_OUTER_CORE))
 
        else
           ! on GPU
@@ -163,12 +137,12 @@
           !--- couple with mantle at the top of the outer core
           !---
           if( ACTUALLY_COUPLE_FLUID_CMB ) &
-               call compute_coupling_fluid_cmb_cuda(Mesh_pointer)
+               call compute_coupling_fluid_cmb_cuda(Mesh_pointer,1)
           !---
           !--- couple with inner core at the bottom of the outer core
           !---
           if( ACTUALLY_COUPLE_FLUID_ICB ) &
-               call compute_coupling_fluid_icb_cuda(Mesh_pointer)
+               call compute_coupling_fluid_icb_cuda(Mesh_pointer,1)
 
        endif
     endif ! iphase == 1
@@ -177,7 +151,6 @@
     ! in outer core
     if( iphase == 1 ) then
       ! sends out MPI interface data (non-blocking)
-
       if(.NOT. GPU_MODE) then
         ! on CPU
         call assemble_MPI_scalar_s(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
@@ -198,31 +171,6 @@
                                 request_send_scalar_oc,request_recv_scalar_oc, &
                                 1) ! <-- 1 == fwd accel
       endif
-
-      ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) then
-        if(.NOT. GPU_MODE) then
-          ! on CPU
-          call assemble_MPI_scalar_s(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
-                                b_accel_outer_core, &
-                                b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
-                                num_interfaces_outer_core,max_nibool_interfaces_oc, &
-                                nibool_interfaces_outer_core,ibool_interfaces_outer_core,&
-                                my_neighbours_outer_core, &
-                                b_request_send_scalar_oc,b_request_recv_scalar_oc)
-        else
-          ! on GPU
-          ! outer core
-          call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                                b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
-                                num_interfaces_outer_core,max_nibool_interfaces_oc, &
-                                nibool_interfaces_outer_core,&
-                                my_neighbours_outer_core, &
-                                b_request_send_scalar_oc,b_request_recv_scalar_oc, &
-                                3) ! <-- 3 == adjoint b_accel
-        endif ! GPU
-      endif ! SIMULATION_TYPE == 3
-
     else
       ! make sure the last communications are finished and processed
       ! waits for send/receive requests to be completed and assembles values
@@ -242,26 +190,6 @@
                                 request_send_scalar_oc,request_recv_scalar_oc, &
                                 1) ! <-- 1 == fwd accel
       endif
-
-      ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) then
-        if(.NOT. GPU_MODE) then
-          ! on CPU
-          call assemble_MPI_scalar_w(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
-                                b_accel_outer_core, &
-                                b_buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
-                                max_nibool_interfaces_oc, &
-                                nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
-                                b_request_send_scalar_oc,b_request_recv_scalar_oc)
-        else
-          ! on GPU
-          call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                                b_buffer_recv_scalar_outer_core, &
-                                num_interfaces_outer_core,max_nibool_interfaces_oc, &
-                                b_request_send_scalar_oc,b_request_recv_scalar_oc, &
-                                3) ! <-- 3 == adjoint b_accel
-        endif
-      endif ! SIMULATION_TYPE == 3
     endif ! iphase == 1
 
   enddo ! iphase
@@ -273,18 +201,209 @@
     ! on CPU
     call update_veloc_acoustic(NGLOB_OUTER_CORE,veloc_outer_core,accel_outer_core, &
                                deltatover2,rmass_outer_core)
+  else
+    ! on GPU
+    ! includes FORWARD_OR_ADJOINT == 1
+    call kernel_3_outer_core_cuda(Mesh_pointer,deltatover2,1)
+  endif
 
-   ! adjoint / kernel runs
-    if (SIMULATION_TYPE == 3) &
-      call update_veloc_acoustic(NGLOB_OUTER_CORE_ADJOINT,b_veloc_outer_core,b_accel_outer_core, &
-                                 b_deltatover2,rmass_outer_core)
+  end subroutine compute_forces_acoustic
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+
+  subroutine compute_forces_acoustic_backward()
+
+! backward/reconstructed wavefields only
+
+  use specfem_par
+  use specfem_par_crustmantle,only: b_displ_crust_mantle, &
+                                    ibool_crust_mantle,ibelm_bottom_crust_mantle
+  use specfem_par_innercore,only: b_displ_inner_core, &
+                                  ibool_inner_core,ibelm_top_inner_core
+  use specfem_par_outercore
+  implicit none
+
+  ! local parameters
+  real(kind=CUSTOM_REAL) :: b_time
+  ! non blocking MPI
+  ! iphase: iphase = 1 is for computing outer elements in the outer core,
+  !         iphase = 2 is for computing inner elements in the outer core (former icall parameter)
+  integer :: iphase
+  logical :: phase_is_inner
+
+  ! checks
+  if( SIMULATION_TYPE /= 3 ) return
+
+  ! compute internal forces in the fluid region
+
+  ! note on backward/reconstructed wavefields:
+  !       b_time for b_displ( it=1 ) corresponds to (NSTEP - 1)*DT - t0 (after the Newmark scheme),
+  !       since we start with the saved wavefields b_displ( 1 ) <-> displ( NSTEP ), which correspond
+  !       to time (NSTEP - (it-1) - 1)*DT - t0;
+  !       this time is needed for reconstructing the rotational contributions
+  if(CUSTOM_REAL == SIZE_REAL) then
+    b_time = sngl((dble(NSTEP-it)*DT-t0)*scale_t_inv)
   else
+    b_time = (dble(NSTEP-it)*DT-t0)*scale_t_inv
+  endif
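+
+  ! worked example (illustrative values only, quoted before the
+  ! non-dimensionalization by scale_t_inv): with NSTEP = 1000, DT = 0.1 and
+  ! t0 = 0., the first backward step it = 1 gives b_time = (1000-1)*0.1 = 99.9,
+  ! i.e. the last forward time, and it = NSTEP gives b_time = 0., the first one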
+
+  ! ****************************************************
+  !   big loop over all spectral elements in the fluid
+  ! ****************************************************
+
+  ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
+  do iphase=1,2
+
+    ! first, iphase == 1 for points on MPI interfaces (thus outer elements)
+    ! second, iphase == 2 for points purely inside partition (thus inner elements)
+    !
+    ! compute all the outer elements first, then send out non-blocking MPI communication
+    ! and continue computing inner elements (overlapping communication and computation)
+    if( iphase == 1 ) then
+      phase_is_inner = .false.
+    else
+      phase_is_inner = .true.
+    endif
+
+    if( .not. GPU_MODE ) then
+      ! on CPU
+      ! adjoint / kernel runs
+      if( USE_DEVILLE_PRODUCTS_VAL ) then
+        ! uses Deville et al. (2002) routine
+        call compute_forces_outer_core_Dev(b_time,b_deltat,b_two_omega_earth, &
+                                    NSPEC_OUTER_CORE_ROT_ADJOINT,NGLOB_OUTER_CORE_ADJOINT, &
+                                    b_A_array_rotation,b_B_array_rotation, &
+                                    b_displ_outer_core,b_accel_outer_core, &
+                                    b_div_displ_outer_core,phase_is_inner)
+      else
+        call compute_forces_outer_core(b_time,b_deltat,b_two_omega_earth, &
+                                    NSPEC_OUTER_CORE_ROT_ADJOINT,NGLOB_OUTER_CORE_ADJOINT, &
+                                    b_A_array_rotation,b_B_array_rotation, &
+                                    b_displ_outer_core,b_accel_outer_core, &
+                                    b_div_displ_outer_core,phase_is_inner)
+      endif
+    else
+      ! on GPU
+      ! includes FORWARD_OR_ADJOINT == 3
+      call compute_forces_outer_core_cuda(Mesh_pointer,iphase,b_time,3)
+    endif
+
+
+    ! computes additional contributions to acceleration field
+    if( iphase == 1 ) then
+
+       ! Stacey absorbing boundaries
+       if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_outer_core_backward()
+
+       ! ****************************************************
+       ! **********  add matching with solid part  **********
+       ! ****************************************************
+       ! only for elements in first matching layer in the fluid
+       if( .not. GPU_MODE ) then
+          ! on CPU
+          !---
+          !--- couple with mantle at the top of the outer core
+          !---
+          if(ACTUALLY_COUPLE_FLUID_CMB) &
+               call compute_coupling_fluid_CMB(b_displ_crust_mantle, &
+                                               ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                                               b_accel_outer_core, &
+                                               normal_top_outer_core,jacobian2D_top_outer_core, &
+                                               wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                                               NSPEC2D_TOP(IREGION_OUTER_CORE))
+
+          !---
+          !--- couple with inner core at the bottom of the outer core
+          !---
+          if(ACTUALLY_COUPLE_FLUID_ICB) &
+               call compute_coupling_fluid_ICB(b_displ_inner_core, &
+                                               ibool_inner_core,ibelm_top_inner_core,  &
+                                               b_accel_outer_core, &
+                                               normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                               wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                                               NSPEC2D_BOTTOM(IREGION_OUTER_CORE))
+       else
+          ! on GPU
+          !---
+          !--- couple with mantle at the top of the outer core
+          !---
+          if( ACTUALLY_COUPLE_FLUID_CMB ) &
+               call compute_coupling_fluid_cmb_cuda(Mesh_pointer,3)
+          !---
+          !--- couple with inner core at the bottom of the outer core
+          !---
+          if( ACTUALLY_COUPLE_FLUID_ICB ) &
+               call compute_coupling_fluid_icb_cuda(Mesh_pointer,3)
+
+       endif
+    endif ! iphase == 1
+
+    ! assemble all the contributions between slices using MPI
+    ! in outer core
+    if( iphase == 1 ) then
+      ! sends out MPI interface data (non-blocking)
+      ! adjoint simulations
+      if(.NOT. GPU_MODE) then
+        ! on CPU
+        call assemble_MPI_scalar_s(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
+                              b_accel_outer_core, &
+                              b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                              nibool_interfaces_outer_core,ibool_interfaces_outer_core,&
+                              my_neighbours_outer_core, &
+                              b_request_send_scalar_oc,b_request_recv_scalar_oc)
+      else
+        ! on GPU
+        ! outer core
+        call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                              b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                              nibool_interfaces_outer_core,&
+                              my_neighbours_outer_core, &
+                              b_request_send_scalar_oc,b_request_recv_scalar_oc, &
+                              3) ! <-- 3 == adjoint b_accel
+      endif ! GPU
+    else
+      ! make sure the last communications are finished and processed
+      ! waits for send/receive requests to be completed and assembles values
+      ! adjoint simulations
+      if(.NOT. GPU_MODE) then
+        ! on CPU
+        call assemble_MPI_scalar_w(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
+                              b_accel_outer_core, &
+                              b_buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
+                              max_nibool_interfaces_oc, &
+                              nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
+                              b_request_send_scalar_oc,b_request_recv_scalar_oc)
+      else
+        ! on GPU
+        call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                              b_buffer_recv_scalar_outer_core, &
+                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                              b_request_send_scalar_oc,b_request_recv_scalar_oc, &
+                              3) ! <-- 3 == adjoint b_accel
+      endif
+    endif ! iphase == 1
+
+  enddo ! iphase
+
+  ! Newmark time scheme:
+  ! corrector terms for fluid parts
+  ! (multiply by the inverse of the mass matrix and update velocity)
+  if(.NOT. GPU_MODE) then
+    ! on CPU
+    ! adjoint / kernel runs
+    call update_veloc_acoustic(NGLOB_OUTER_CORE_ADJOINT,b_veloc_outer_core,b_accel_outer_core, &
+                               b_deltatover2,rmass_outer_core)
+  else
     ! on GPU
-    call kernel_3_outer_core_cuda(Mesh_pointer, &
-                                 deltatover2,SIMULATION_TYPE,b_deltatover2)
+    ! includes FORWARD_OR_ADJOINT == 3
+    call kernel_3_outer_core_cuda(Mesh_pointer,b_deltatover2,3)
   endif
 
-  end subroutine compute_forces_acoustic
+  end subroutine compute_forces_acoustic_backward
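+
+! note on dispatch: on the GPU, both acoustic routines pass an explicit flag to
+! the CUDA wrappers (1 == fwd accel, 3 == adjoint b_accel); an illustrative
+! driver, not part of this file, would call them from the time loop as
+!   do it = 1,NSTEP
+!     call compute_forces_acoustic()
+!     if( SIMULATION_TYPE == 3 ) call compute_forces_acoustic_backward()
+!   enddo
+! (compute_forces_acoustic_backward additionally guards itself and returns
+! immediately unless SIMULATION_TYPE == 3)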
 
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -69,6 +69,10 @@
     nspec_outer => nspec_outer_crust_mantle, &
     nspec_inner => nspec_inner_crust_mantle
 
+#ifdef FORCE_VECTORIZATION
+  use specfem_par,only: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
+#endif
+
 !daniel: att - debug
 !  use specfem_par,only: it,NSTEP
 
@@ -421,8 +425,7 @@
       sum_terms(2,ijk,1,1) = - (fac1*newtempy1(ijk,1,1) + fac2*newtempy2(ijk,1,1) + fac3*newtempy3(ijk,1,1))
       sum_terms(3,ijk,1,1) = - (fac1*newtempz1(ijk,1,1) + fac2*newtempz2(ijk,1,1) + fac3*newtempz3(ijk,1,1))
     enddo
-
-    ! add gravity terms
+    ! adds gravity terms
     if(GRAVITY_VAL) then
       do ijk = 1,NDIM*NGLLCUBE
         sum_terms(ijk,1,1,1) = sum_terms(ijk,1,1,1) + rho_s_H(ijk,1,1,1)
@@ -435,15 +438,12 @@
         do i=1,NGLLX
           fac2 = wgllwgll_xz(i,k)
           fac3 = wgllwgll_xy(i,j)
-
-          ! sum contributions
+          ! sums contributions
           sum_terms(1,i,j,k) = - (fac1*newtempx1(i,j,k) + fac2*newtempx2(i,j,k) + fac3*newtempx3(i,j,k))
           sum_terms(2,i,j,k) = - (fac1*newtempy1(i,j,k) + fac2*newtempy2(i,j,k) + fac3*newtempy3(i,j,k))
           sum_terms(3,i,j,k) = - (fac1*newtempz1(i,j,k) + fac2*newtempz2(i,j,k) + fac3*newtempz3(i,j,k))
-
-          ! add gravity terms
+          ! adds gravity terms
           if(GRAVITY_VAL) sum_terms(:,i,j,k) = sum_terms(:,i,j,k) + rho_s_H(:,i,j,k)
-
         enddo ! NGLLX
       enddo ! NGLLY
     enddo ! NGLLZ
@@ -507,23 +507,12 @@
 
     ! save deviatoric strain for Runge-Kutta scheme
     if(COMPUTE_AND_STORE_STRAIN) then
-
-#ifdef FORCE_VECTORIZATION
-      do ijk = 1,NGLLCUBE
-        epsilondev_xx(ijk,1,1,ispec) = epsilondev_loc(1,ijk,1,1)
-        epsilondev_yy(ijk,1,1,ispec) = epsilondev_loc(2,ijk,1,1)
-        epsilondev_xy(ijk,1,1,ispec) = epsilondev_loc(3,ijk,1,1)
-        epsilondev_xz(ijk,1,1,ispec) = epsilondev_loc(4,ijk,1,1)
-        epsilondev_yz(ijk,1,1,ispec) = epsilondev_loc(5,ijk,1,1)
-      enddo
-#else
       epsilondev_xx(:,:,:,ispec) = epsilondev_loc(1,:,:,:)
       epsilondev_yy(:,:,:,ispec) = epsilondev_loc(2,:,:,:)
       epsilondev_xy(:,:,:,ispec) = epsilondev_loc(3,:,:,:)
       epsilondev_xz(:,:,:,ispec) = epsilondev_loc(4,:,:,:)
       epsilondev_yz(:,:,:,ispec) = epsilondev_loc(5,:,:,:)
     endif
-#endif
 
   enddo ! of spectral element loop NSPEC_CRUST_MANTLE
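Under FORCE_VECTORIZATION, the collapsed ijk loops read the GLL weight products
through 3-D copies (wgllwgll_xy_3D, wgllwgll_xz_3D, wgllwgll_yz_3D) so that a
single linear index suffices. A sketch of how such replicated arrays would be
built (the construction below is an illustrative assumption; only the use
statement appears in this file):

  ! replicate the 2-D weight products along the remaining dimension so that
  ! wgllwgll_yz_3D(ijk,1,1) can stand in for wgllwgll_yz(j,k) in collapsed loops
  do k = 1,NGLLZ
    do j = 1,NGLLY
      do i = 1,NGLLX
        wgllwgll_yz_3D(i,j,k) = wgllwgll_yz(j,k)
        wgllwgll_xz_3D(i,j,k) = wgllwgll_xz(i,k)
        wgllwgll_xy_3D(i,j,k) = wgllwgll_xy(i,j)
      enddo
    enddo
  enddo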
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_noDev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_noDev.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_noDev.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -273,12 +273,13 @@
           endif
 
           ! precompute terms for attenuation if needed
-          if( ATTENUATION_VAL .and. (ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL)) then
-            one_minus_sum_beta_use = one_minus_sum_beta(i,j,k,ispec)
+          if( ATTENUATION_VAL ) then
+            if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+              one_minus_sum_beta_use = one_minus_sum_beta(i,j,k,ispec)
+            else
+              one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
+            endif
             minus_sum_beta =  one_minus_sum_beta_use - 1.0_CUSTOM_REAL
-          else if( ATTENUATION_VAL ) then
-            one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
-            minus_sum_beta =  one_minus_sum_beta_use - 1.0_CUSTOM_REAL
           endif
 
           !
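The two index patterns, (i,j,k,ispec) versus (1,1,1,ispec), reflect how
one_minus_sum_beta is dimensioned in the two attenuation builds. A sketch of
the assumed storage convention (consistent with the indexing above; the actual
declarations are not part of this file):

  ! 3-D attenuation storage: one value per GLL point
  !   real(kind=CUSTOM_REAL) :: one_minus_sum_beta(NGLLX,NGLLY,NGLLZ,NSPEC)
  ! 1-D attenuation storage: a single value per element, kept at index (1,1,1)
  !   real(kind=CUSTOM_REAL) :: one_minus_sum_beta(1,1,1,NSPEC)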

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -62,6 +62,10 @@
     nspec_outer => nspec_outer_inner_core, &
     nspec_inner => nspec_inner_inner_core
 
+#ifdef FORCE_VECTORIZATION
+  use specfem_par,only: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
+#endif
+
   implicit none
 
   integer :: NSPEC,NGLOB,NSPEC_ATT
@@ -171,6 +175,10 @@
   integer :: num_elements,ispec_p
   integer :: iphase
 
+#ifdef FORCE_VECTORIZATION
+  integer :: ijk
+#endif
+
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
@@ -334,12 +342,6 @@
               epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
             endif
 
-            if(ATTENUATION_VAL .and. (ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL)) then
-              minus_sum_beta =  one_minus_sum_beta(i,j,k,ispec) - 1.0_CUSTOM_REAL
-            else if( ATTENUATION_VAL ) then
-              minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
-            endif
-
             if(ANISOTROPIC_INNER_CORE_VAL) then
               ! elastic tensor for hexagonal symmetry in reduced notation:
               !
@@ -365,6 +367,11 @@
 
               ! use unrelaxed parameters if attenuation
               if(ATTENUATION_VAL) then
+                if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+                  minus_sum_beta =  one_minus_sum_beta(i,j,k,ispec) - 1.0_CUSTOM_REAL
+                else
+                  minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
+                endif
                 mul = muvstore(i,j,k,ispec)
                 c11l = c11l + FOUR_THIRDS * minus_sum_beta * mul
                 c12l = c12l - TWO_THIRDS * minus_sum_beta * mul
@@ -387,10 +394,12 @@
               mul = muvstore(i,j,k,ispec)
 
               ! use unrelaxed parameters if attenuation
-              if(ATTENUATION_VAL .and. (ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL)) then
-                mul = mul * one_minus_sum_beta(i,j,k,ispec)
-              else if( ATTENUATION_VAL ) then
-                mul = mul * one_minus_sum_beta(1,1,1,ispec)
+              if( ATTENUATION_VAL ) then
+                if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+                  mul = mul * one_minus_sum_beta(i,j,k,ispec)
+                else
+                  mul = mul * one_minus_sum_beta(1,1,1,ispec)
+                endif
               endif
 
               lambdalplus2mul = kappal + FOUR_THIRDS * mul
@@ -623,6 +632,21 @@
         enddo
       enddo
 
+#ifdef FORCE_VECTORIZATION
+      do ijk=1,NGLLCUBE
+        fac1 = wgllwgll_yz_3D(ijk,1,1)
+        fac2 = wgllwgll_xz_3D(ijk,1,1)
+        fac3 = wgllwgll_xy_3D(ijk,1,1)
+        sum_terms(1,ijk,1,1) = - (fac1*newtempx1(ijk,1,1) + fac2*newtempx2(ijk,1,1) + fac3*newtempx3(ijk,1,1))
+        sum_terms(2,ijk,1,1) = - (fac1*newtempy1(ijk,1,1) + fac2*newtempy2(ijk,1,1) + fac3*newtempy3(ijk,1,1))
+        sum_terms(3,ijk,1,1) = - (fac1*newtempz1(ijk,1,1) + fac2*newtempz2(ijk,1,1) + fac3*newtempz3(ijk,1,1))
+      enddo
+      if(GRAVITY_VAL) then
+        do ijk = 1,NDIM*NGLLCUBE
+          sum_terms(ijk,1,1,1) = sum_terms(ijk,1,1,1) + rho_s_H(ijk,1,1,1)
+        enddo
+      endif
+#else
       do k=1,NGLLZ
         do j=1,NGLLY
           fac1 = wgllwgll_yz(j,k)
@@ -637,8 +661,26 @@
           enddo
         enddo
       enddo
+#endif
 
       ! sum contributions from each element to the global mesh and add gravity terms
+#ifdef FORCE_VECTORIZATION
+! we can force vectorization using a compiler directive here because we know that there is no dependency
+! inside a given spectral element, since all the global points of a local element are different by definition
+! (only common points between different elements can be the same)
+! IBM, Portland PGI, and Intel and Cray syntax (Intel and Cray are the same)
+!IBM* ASSERT (NODEPS)
+!pgi$ ivdep
+!DIR$ IVDEP
+      do ijk = 1,NGLLCUBE
+        iglob = ibool(ijk,1,1,ispec)
+        ! do NOT use array syntax ":" for the three statements below
+        ! otherwise most compilers will not be able to vectorize the outer loop
+        accel_inner_core(1,iglob) = accel_inner_core(1,iglob) + sum_terms(1,ijk,1,1)
+        accel_inner_core(2,iglob) = accel_inner_core(2,iglob) + sum_terms(2,ijk,1,1)
+        accel_inner_core(3,iglob) = accel_inner_core(3,iglob) + sum_terms(3,ijk,1,1)
+      enddo
+#else
       do k=1,NGLLZ
         do j=1,NGLLY
           do i=1,NGLLX
@@ -647,6 +689,7 @@
           enddo
         enddo
       enddo
+#endif
 
       ! use Runge-Kutta scheme to march memory variables in time
       ! convention for attenuation
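The collapsed ijk loops rely on Fortran column-major storage: for an element
array dimensioned (NGLLX,NGLLY,NGLLZ), the single index ijk = 1,NGLLCUBE walks
exactly the same memory as the classic triple loop. A worked equivalence
(illustrative only):

  ! for a(NGLLX,NGLLY,NGLLZ), a(ijk,1,1) touches the same storage as a(i,j,k)
  integer :: i,j,k,ijk
  ijk = 0
  do k = 1,NGLLZ
    do j = 1,NGLLY
      do i = 1,NGLLX
        ijk = ijk + 1
        ! column-major layout gives ijk == i + NGLLX*((j-1) + NGLLY*(k-1))
      enddo
    enddo
  enddo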

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_noDev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_noDev.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_noDev.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -215,7 +215,7 @@
           gammayl = gammay(i,j,k,ispec)
           gammazl = gammaz(i,j,k,ispec)
 
-! compute the jacobian
+          ! compute the jacobian
           jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
                         - xiyl*(etaxl*gammazl-etazl*gammaxl) &
                         + xizl*(etaxl*gammayl-etayl*gammaxl))
@@ -232,7 +232,7 @@
           duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l
           duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l
 
-! precompute some sums to save CPU time
+          ! precompute some sums to save CPU time
           duxdxl_plus_duydyl = duxdxl + duydyl
           duxdxl_plus_duzdzl = duxdxl + duzdzl
           duydyl_plus_duzdzl = duydyl + duzdzl
@@ -255,40 +255,36 @@
             epsilondev_loc(5,i,j,k) = 0.5 * duzdyl_plus_duydzl
           endif
 
-          ! precompute terms for attenuation if needed
-          if( ATTENUATION_VAL .and. (ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL)) then
-            minus_sum_beta =  one_minus_sum_beta(i,j,k,ispec) - 1.0_CUSTOM_REAL
-          else if( ATTENUATION_VAL ) then
-            minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
-          endif
-
           if(ANISOTROPIC_INNER_CORE_VAL) then
-
-! elastic tensor for hexagonal symmetry in reduced notation:
-!
-!      c11 c12 c13  0   0        0
-!      c12 c11 c13  0   0        0
-!      c13 c13 c33  0   0        0
-!       0   0   0  c44  0        0
-!       0   0   0   0  c44       0
-!       0   0   0   0   0  (c11-c12)/2
-!
-!       in terms of the A, C, L, N and F of Love (1927):
-!
-!       c11 = A
-!       c12 = A-2N
-!       c13 = F
-!       c33 = C
-!       c44 = L
-
+            ! elastic tensor for hexagonal symmetry in reduced notation:
+            !
+            !      c11 c12 c13  0   0        0
+            !      c12 c11 c13  0   0        0
+            !      c13 c13 c33  0   0        0
+            !       0   0   0  c44  0        0
+            !       0   0   0   0  c44       0
+            !       0   0   0   0   0  (c11-c12)/2
+            !
+            !       in terms of the A, C, L, N and F of Love (1927):
+            !
+            !       c11 = A
+            !       c12 = A-2N
+            !       c13 = F
+            !       c33 = C
+            !       c44 = L
             c11l = c11store(i,j,k,ispec)
             c12l = c12store(i,j,k,ispec)
             c13l = c13store(i,j,k,ispec)
             c33l = c33store(i,j,k,ispec)
             c44l = c44store(i,j,k,ispec)
 
-! use unrelaxed parameters if attenuation
+            ! use unrelaxed parameters if attenuation
             if(ATTENUATION_VAL) then
+              if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+                minus_sum_beta =  one_minus_sum_beta(i,j,k,ispec) - 1.0_CUSTOM_REAL
+              else
+                minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
+              endif
               mul = muvstore(i,j,k,ispec)
               c11l = c11l + FOUR_THIRDS * minus_sum_beta * mul
               c12l = c12l - TWO_THIRDS * minus_sum_beta * mul
@@ -305,23 +301,24 @@
             sigma_yz = c44l*duzdyl_plus_duydzl
           else
 
-! inner core with no anisotropy, use kappav and muv for instance
-! layer with no anisotropy, use kappav and muv for instance
+            ! inner core layer with no anisotropy: use kappav and muv for instance
             kappal = kappavstore(i,j,k,ispec)
             mul = muvstore(i,j,k,ispec)
 
             ! use unrelaxed parameters if attenuation
-            if( ATTENUATION_VAL .and. (ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL)) then
-              mul = mul * one_minus_sum_beta(i,j,k,ispec)
-            else if( ATTENUATION_VAL ) then
-              mul = mul * one_minus_sum_beta(1,1,1,ispec)
+            if( ATTENUATION_VAL ) then
+              if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+                mul = mul * one_minus_sum_beta(i,j,k,ispec)
+              else
+                mul = mul * one_minus_sum_beta(1,1,1,ispec)
+              endif
             endif
 
             lambdalplus2mul = kappal + FOUR_THIRDS * mul
             lambdal = lambdalplus2mul - 2.*mul
 
-! compute stress sigma
-
+            ! compute stress sigma
             sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
             sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
             sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
@@ -332,7 +329,7 @@
 
           endif
 
-! subtract memory variables if attenuation
+          ! subtract memory variables if attenuation
           if( ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
             do i_SLS = 1,N_SLS
               R_xx_val = R_xx(i_SLS,i,j,k,ispec)
@@ -346,24 +343,23 @@
             enddo
           endif
 
-! define symmetric components of sigma for gravity
+          ! define symmetric components of sigma for gravity
           sigma_yx = sigma_xy
           sigma_zx = sigma_xz
           sigma_zy = sigma_yz
 
-! compute non-symmetric terms for gravity
+          ! compute non-symmetric terms for gravity
           if(GRAVITY_VAL) then
 
-! use mesh coordinates to get theta and phi
-! x y and z contain r theta and phi
-
+            ! use mesh coordinates to get theta and phi
+            ! x y and z contain r theta and phi
             iglob = ibool(i,j,k,ispec)
             radius = dble(xstore(iglob))
             theta = dble(ystore(iglob))
             phi = dble(zstore(iglob))
 
-! make sure radius is never zero even for points at center of cube
-! because we later divide by radius
+            ! make sure radius is never zero even for points at center of cube
+            ! because we later divide by radius
             if(radius < 100.d0 / R_EARTH) radius = 100.d0 / R_EARTH
 
             cos_theta = dcos(theta)
@@ -371,23 +367,22 @@
             cos_phi = dcos(phi)
             sin_phi = dsin(phi)
 
-! get g, rho and dg/dr=dg
-! spherical components of the gravitational acceleration
-! for efficiency replace with lookup table every 100 m in radial direction
-! make sure we never use zero for point exactly at the center of the Earth
+            ! get g, rho and dg/dr=dg
+            ! spherical components of the gravitational acceleration
+            ! for efficiency replace with lookup table every 100 m in radial direction
+            ! make sure we never use zero for point exactly at the center of the Earth
             int_radius = max(1,nint(radius * R_EARTH_KM * 10.d0))
             minus_g = minus_gravity_table(int_radius)
             minus_dg = minus_deriv_gravity_table(int_radius)
             rho = density_table(int_radius)
 
-! Cartesian components of the gravitational acceleration
+            ! Cartesian components of the gravitational acceleration
             gxl = minus_g*sin_theta*cos_phi
             gyl = minus_g*sin_theta*sin_phi
             gzl = minus_g*cos_theta
 
-! Cartesian components of gradient of gravitational acceleration
-! obtained from spherical components
-
+            ! Cartesian components of gradient of gravitational acceleration
+            ! obtained from spherical components
             minus_g_over_radius = minus_g / radius
             minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
 
@@ -405,15 +400,15 @@
 
             iglob = ibool(i,j,k,ispec)
 
-! distinguish between single and double precision for reals
+            ! distinguish between single and double precision for reals
             if(CUSTOM_REAL == SIZE_REAL) then
 
-! get displacement and multiply by density to compute G tensor
+              ! get displacement and multiply by density to compute G tensor
               sx_l = rho * dble(displ_inner_core(1,iglob))
               sy_l = rho * dble(displ_inner_core(2,iglob))
               sz_l = rho * dble(displ_inner_core(3,iglob))
 
-! compute G tensor from s . g and add to sigma (not symmetric)
+              ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
               sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
               sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
@@ -427,7 +422,7 @@
               sigma_yz = sigma_yz - sngl(sy_l * gzl)
               sigma_zy = sigma_zy - sngl(sz_l * gyl)
 
-! precompute vector
+              ! precompute vector
               factor = dble(jacobianl) * wgll_cube(i,j,k)
               rho_s_H(1,i,j,k) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
               rho_s_H(2,i,j,k) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
@@ -435,12 +430,12 @@
 
             else
 
-! get displacement and multiply by density to compute G tensor
+              ! get displacement and multiply by density to compute G tensor
               sx_l = rho * displ_inner_core(1,iglob)
               sy_l = rho * displ_inner_core(2,iglob)
               sz_l = rho * displ_inner_core(3,iglob)
 
-! compute G tensor from s . g and add to sigma (not symmetric)
+              ! compute G tensor from s . g and add to sigma (not symmetric)
               sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
               sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
               sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
@@ -454,7 +449,7 @@
               sigma_yz = sigma_yz - sy_l * gzl
               sigma_zy = sigma_zy - sz_l * gyl
 
-! precompute vector
+              ! precompute vector
               factor = jacobianl * wgll_cube(i,j,k)
               rho_s_H(1,i,j,k) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
               rho_s_H(2,i,j,k) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
@@ -464,8 +459,7 @@
 
           endif  ! end of section with gravity terms
 
-! form dot product with test vector, non-symmetric form
-
+          ! form dot product with test vector, non-symmetric form
           tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl)
           tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl)
           tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl)
@@ -533,7 +527,7 @@
       enddo
     enddo
 
-! sum contributions from each element to the global mesh and add gravity terms
+    ! sum contributions from each element to the global mesh and add gravity terms
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
@@ -598,8 +592,7 @@
     endif
 
     if (COMPUTE_AND_STORE_STRAIN) then
-! save deviatoric strain for Runge-Kutta scheme
-      !epsilondev(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
+      ! save deviatoric strain for Runge-Kutta scheme
       do k=1,NGLLZ
         do j=1,NGLLY
           do i=1,NGLLX
@@ -611,7 +604,6 @@
           enddo
         enddo
       enddo
-
     endif
 
   endif   ! end test to exclude fictitious elements in central cube
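For reference, the gravity and density tables are sampled every 100 m in the
radial direction, hence the factor R_EARTH_KM * 10.d0 in the index computation.
A worked example, assuming the usual R_EARTH_KM = 6371 (illustrative only):

  ! a point at the ICB radius of 1221.5 km, with mesh radii normalized by
  ! R_EARTH, selects table entry 12215, i.e. one entry per 100 m:
  !   int_radius = max(1, nint( (1221.5d0/6371.d0) * 6371.d0 * 10.d0 ))  ! = 12215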

Copied: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.F90 (from rev 22734, seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90)
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -0,0 +1,456 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and CNRS / INRIA / University of Pau, France
+! (c) Princeton University and CNRS / INRIA / University of Pau
+!                            August 2013
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
+                                          NSPEC,NGLOB, &
+                                          A_array_rotation,B_array_rotation, &
+                                          displfluid,accelfluid, &
+                                          div_displfluid,phase_is_inner)
+
+! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
+
+  use constants_solver
+
+  use specfem_par,only: &
+    hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+    minus_rho_g_over_kappa_fluid,d_ln_density_dr_table, &
+    MOVIE_VOLUME
+
+  use specfem_par_outercore,only: &
+    xstore => xstore_outer_core,ystore => ystore_outer_core,zstore => zstore_outer_core, &
+    xix => xix_outer_core,xiy => xiy_outer_core,xiz => xiz_outer_core, &
+    etax => etax_outer_core,etay => etay_outer_core,etaz => etaz_outer_core, &
+    gammax => gammax_outer_core,gammay => gammay_outer_core,gammaz => gammaz_outer_core, &
+    ibool => ibool_outer_core, &
+    phase_ispec_inner => phase_ispec_inner_outer_core, &
+    nspec_outer => nspec_outer_outer_core, &
+    nspec_inner => nspec_inner_outer_core
+
+#ifdef FORCE_VECTORIZATION
+  use specfem_par,only: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
+#endif
+
+  implicit none
+
+  integer :: NSPEC,NGLOB
+
+  ! for the Euler scheme for rotation
+  real(kind=CUSTOM_REAL) time,deltat,two_omega_earth
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: &
+    A_array_rotation,B_array_rotation
+
+  ! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NGLOB) :: displfluid,accelfluid
+
+  ! divergence of displacement
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: div_displfluid
+
+  ! inner/outer element run flag
+  logical :: phase_is_inner
+
+  ! local parameters
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
+  ! for gravity
+  integer :: int_radius
+  double precision :: radius,theta,phi,gxl,gyl,gzl
+  double precision :: cos_theta,sin_theta,cos_phi,sin_phi
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: gravity_term
+  ! for the Euler scheme for rotation
+  real(kind=CUSTOM_REAL) :: two_omega_deltat,cos_two_omega_t,sin_two_omega_t,A_rotation,B_rotation, &
+       ux_rotation,uy_rotation,dpotentialdx_with_rot,dpotentialdy_with_rot
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: source_euler_A,source_euler_B
+
+  integer :: ispec,iglob
+  integer :: i,j,k
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) :: dpotentialdxl,dpotentialdyl,dpotentialdzl
+#ifdef FORCE_VECTORIZATION
+  ! sum_terms must be a full element array (and ijk declared) for the
+  ! collapsed single-index loops used below
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sum_terms
+  integer :: ijk
+#else
+  real(kind=CUSTOM_REAL) :: sum_terms
+#endif
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points
+  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points
+
+  equivalence(dummyx_loc,B1_m1_m2_5points)
+  equivalence(tempx1,C1_m1_m2_5points)
+  equivalence(newtempx1,E1_m1_m2_5points)
+
+  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: A1_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(m2,m1) :: C1_mxm_m2_m1_5points
+  real(kind=CUSTOM_REAL), dimension(m2,m1) :: E1_mxm_m2_m1_5points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
+  equivalence(tempx3,C1_mxm_m2_m1_5points)
+  equivalence(newtempx3,E1_mxm_m2_m1_5points)
+
+  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: temp_gxl,temp_gyl,temp_gzl
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    displ_times_grad_x_ln_rho,displ_times_grad_y_ln_rho,displ_times_grad_z_ln_rho
+
+! integer :: computed_elements
+  integer :: num_elements,ispec_p
+  integer :: iphase
+
+! ****************************************************
+!   big loop over all spectral elements in the fluid
+! ****************************************************
+
+  if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. (.not. phase_is_inner) ) div_displfluid(:,:,:,:) = 0._CUSTOM_REAL
+
+! computed_elements = 0
+  if( .not. phase_is_inner ) then
+    iphase = 1
+    num_elements = nspec_outer
+  else
+    iphase = 2
+    num_elements = nspec_inner
+  endif
+
+  do ispec_p = 1,num_elements
+
+    ispec = phase_ispec_inner(ispec_p,iphase)
+
+    ! only compute elements which belong to the current phase (inner or outer elements)
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+          iglob = ibool(i,j,k,ispec)
+
+          ! stores "displacement"
+          dummyx_loc(i,j,k) = displfluid(iglob)
+
+          ! pre-computes factors
+          ! use mesh coordinates to get theta and phi
+          ! x y z contain r theta phi
+          radius = dble(xstore(iglob))
+          theta = dble(ystore(iglob))
+          phi = dble(zstore(iglob))
+
+          cos_theta = dcos(theta)
+          sin_theta = dsin(theta)
+          cos_phi = dcos(phi)
+          sin_phi = dsin(phi)
+
+          int_radius = nint(radius * R_EARTH_KM * 10.d0)
+
+          if( .not. GRAVITY_VAL ) then
+            ! grad(rho)/rho in Cartesian components
+            displ_times_grad_x_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
+                  * sngl(sin_theta * cos_phi * d_ln_density_dr_table(int_radius))
+            displ_times_grad_y_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
+                  * sngl(sin_theta * sin_phi * d_ln_density_dr_table(int_radius))
+            displ_times_grad_z_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
+                  * sngl(cos_theta * d_ln_density_dr_table(int_radius))
+          else
+            ! Cartesian components of the gravitational acceleration
+            ! integrate and multiply by rho / Kappa
+            temp_gxl(i,j,k) = sin_theta*cos_phi
+            temp_gyl(i,j,k) = sin_theta*sin_phi
+            temp_gzl(i,j,k) = cos_theta
+          endif
+
+        enddo
+      enddo
+    enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+    do j=1,m2
+      do i=1,m1
+        C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
+                                hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
+                                hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
+                                hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
+                                hprime_xx(i,5)*B1_m1_m2_5points(5,j)
+      enddo
+    enddo
+
+    do k = 1,NGLLX
+      do j=1,m1
+        do i=1,m1
+          tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                          dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                          dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                          dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                          dummyx_loc(i,5,k)*hprime_xxT(5,j)
+        enddo
+      enddo
+    enddo
+
+    do j=1,m1
+      do i=1,m2
+        C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
+                                    A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
+                                    A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
+                                    A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
+                                    A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
+      enddo
+    enddo
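+
+    ! note: the three loops above inline the dense 5x5 matrix products of the
+    ! Deville et al. (2002) scheme: tempx1 holds the xi-derivative, tempx2 the
+    ! eta-derivative and tempx3 the gamma-derivative of the potential, computed
+    ! through the equivalenced (m1,m2) and (m2,m1) views of the 5x5x5 element array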
+
+
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+
+          ! get derivatives of velocity potential with respect to x, y and z
+          xixl = xix(i,j,k,ispec)
+          xiyl = xiy(i,j,k,ispec)
+          xizl = xiz(i,j,k,ispec)
+          etaxl = etax(i,j,k,ispec)
+          etayl = etay(i,j,k,ispec)
+          etazl = etaz(i,j,k,ispec)
+          gammaxl = gammax(i,j,k,ispec)
+          gammayl = gammay(i,j,k,ispec)
+          gammazl = gammaz(i,j,k,ispec)
+
+          ! compute the jacobian
+          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                        + xizl*(etaxl*gammayl-etayl*gammaxl))
+
+          dpotentialdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+          dpotentialdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+          dpotentialdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+          ! compute contribution of rotation and add to gradient of potential
+          ! this term has no Z component
+          if(ROTATION_VAL) then
+
+            ! store the source for the Euler scheme for A_rotation and B_rotation
+            two_omega_deltat = deltat * two_omega_earth
+
+            cos_two_omega_t = cos(two_omega_earth*time)
+            sin_two_omega_t = sin(two_omega_earth*time)
+
+            ! time step deltat of Euler scheme is included in the source
+            source_euler_A(i,j,k) = two_omega_deltat &
+                  * (cos_two_omega_t * dpotentialdyl + sin_two_omega_t * dpotentialdxl)
+            source_euler_B(i,j,k) = two_omega_deltat &
+                  * (sin_two_omega_t * dpotentialdyl - cos_two_omega_t * dpotentialdxl)
+
+            A_rotation = A_array_rotation(i,j,k,ispec)
+            B_rotation = B_array_rotation(i,j,k,ispec)
+
+            ux_rotation =   A_rotation*cos_two_omega_t + B_rotation*sin_two_omega_t
+            uy_rotation = - A_rotation*sin_two_omega_t + B_rotation*cos_two_omega_t
+
+            dpotentialdx_with_rot = dpotentialdxl + ux_rotation
+            dpotentialdy_with_rot = dpotentialdyl + uy_rotation
+
+          else
+
+            dpotentialdx_with_rot = dpotentialdxl
+            dpotentialdy_with_rot = dpotentialdyl
+
+          endif  ! end of section with rotation
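+
+          ! note: with w = two_omega_earth, the update applied further below
+          ! (when ROTATION_VAL is set) through source_euler_A/B is the explicit Euler scheme
+          !   A(n+1) = A(n) + deltat*w*( cos(w*time)*dpotentialdyl + sin(w*time)*dpotentialdxl )
+          !   B(n+1) = B(n) + deltat*w*( sin(w*time)*dpotentialdyl - cos(w*time)*dpotentialdxl )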
+
+          ! add (chi/rho)grad(rho) term in no gravity case
+          if(.not. GRAVITY_VAL) then
+
+            ! With regard to the non-gravitating case: we cannot set N^2 = 0 *and* let g = 0.
+            ! We can *either* assume N^2 = 0 but keep gravity g, *or* we can assume that gravity
+            ! is negligible to begin with, as in our GJI 2002a, in which case N does not arise.
+            ! We get:
+            !
+            ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
+            !
+            ! Then the displacement is
+            !
+            ! \bu = \bdel\chi+\chi\bdel\ln\rho = \rho^{-1}\bdel(\rho\chi)
+            !
+            ! and the pressure is
+            !
+            ! p = -\rho\ddot{\chi}
+            !
+            ! Thus in our 2002b GJI paper eqn (21) is wrong, and equation (41)
+            ! in our AGU monograph is incorrect; these equations should be replaced by
+            !
+            ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
+            !
+            ! Note that the fluid potential we use in GJI 2002a differs from the one used here:
+            !
+            ! \chi_GJI2002a = \rho\partial_t\chi
+            !
+            ! such that
+            !
+            ! \bv = \partial_t\bu=\rho^{-1}\bdel\chi_GJI2002a  (GJI 2002a eqn 20)
+            !
+            ! p = - \partial_t\chi_GJI2002a (GJI 2002a eqn 19)
+
+            ! use mesh coordinates to get theta and phi
+            ! x y z contain r theta phi
+            dpotentialdx_with_rot = dpotentialdx_with_rot + displ_times_grad_x_ln_rho(i,j,k)
+            dpotentialdy_with_rot = dpotentialdy_with_rot + displ_times_grad_y_ln_rho(i,j,k)
+            dpotentialdzl = dpotentialdzl + displ_times_grad_z_ln_rho(i,j,k)
+
+          else  ! if gravity is turned on
+
+            ! compute divergence of displacement
+            gxl = temp_gxl(i,j,k)
+            gyl = temp_gyl(i,j,k)
+            gzl = temp_gzl(i,j,k)
+
+            ! distinguish between single and double precision for reals
+            if(CUSTOM_REAL == SIZE_REAL) then
+              gravity_term(i,j,k) = &
+                      sngl( minus_rho_g_over_kappa_fluid(int_radius) &
+                      * dble(jacobianl) * wgll_cube(i,j,k) &
+                      * (dble(dpotentialdx_with_rot) * gxl  &
+                         + dble(dpotentialdy_with_rot) * gyl &
+                         + dble(dpotentialdzl) * gzl) )
+            else
+              gravity_term(i,j,k) = minus_rho_g_over_kappa_fluid(int_radius) * &
+                        jacobianl * wgll_cube(i,j,k) &
+                        * (dpotentialdx_with_rot * gxl  &
+                          + dpotentialdy_with_rot * gyl &
+                          + dpotentialdzl * gzl)
+            endif
+
+            ! divergence of displacement field with gravity on
+            ! note: these calculations are only used for SIMULATION_TYPE == 1 .and. SAVE_FORWARD,
+            !       with MOVIE_VOLUME_TYPE == 4 set when MOVIE_VOLUME is .true.;
+            !       in case of SIMULATION_TYPE == 3, this gets overwritten by compute_kernels_outer_core()
+            if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. MOVIE_VOLUME) then
+              div_displfluid(i,j,k,ispec) =  &
+                        minus_rho_g_over_kappa_fluid(int_radius) &
+                        * (dpotentialdx_with_rot * gxl &
+                         + dpotentialdy_with_rot * gyl &
+                         + dpotentialdzl * gzl)
+            endif
+
+          endif
+
+          tempx1(i,j,k) = jacobianl*(xixl*dpotentialdx_with_rot &
+                                   + xiyl*dpotentialdy_with_rot + xizl*dpotentialdzl)
+          tempx2(i,j,k) = jacobianl*(etaxl*dpotentialdx_with_rot &
+                                   + etayl*dpotentialdy_with_rot + etazl*dpotentialdzl)
+          tempx3(i,j,k) = jacobianl*(gammaxl*dpotentialdx_with_rot &
+                                   + gammayl*dpotentialdy_with_rot + gammazl*dpotentialdzl)
+
+        enddo
+      enddo
+    enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+    do j=1,m2
+      do i=1,m1
+        E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
+                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
+                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
+                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
+                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
+      enddo
+    enddo
+
+    do k = 1,NGLLX
+      do j=1,m1
+        do i=1,m1
+          newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                             tempx2(i,5,k)*hprimewgll_xx(5,j)
+        enddo
+      enddo
+    enddo
+
+    do j=1,m1
+      do i=1,m2
+        E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
+                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
+                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
+                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
+                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
+      enddo
+    enddo
+
+#ifdef FORCE_VECTORIZATION
+    ! sum contributions from each element to the global mesh and add gravity term
+    do ijk=1,NGLLCUBE
+      sum_terms(ijk,1,1) = - (wgllwgll_yz_3D(ijk,1,1)*newtempx1(ijk,1,1) &
+                            + wgllwgll_xz_3D(ijk,1,1)*newtempx2(ijk,1,1) &
+                            + wgllwgll_xy_3D(ijk,1,1)*newtempx3(ijk,1,1))
+    enddo
+    if(GRAVITY_VAL) then
+      do ijk = 1,NGLLCUBE
+        sum_terms(ijk,1,1) = sum_terms(ijk,1,1) + gravity_term(ijk,1,1)
+      enddo
+    endif
+! we can force vectorization using a compiler directive here because we know that there is no dependency
+! inside a given spectral element, since all the global points of a local element are different by definition
+! (only common points between different elements can be the same)
+! IBM, Portland PGI, and Intel and Cray syntax (Intel and Cray are the same)
+!IBM* ASSERT (NODEPS)
+!pgi$ ivdep
+!DIR$ IVDEP
+    do ijk = 1,NGLLCUBE
+      iglob = ibool(ijk,1,1,ispec)
+      accelfluid(iglob) = accelfluid(iglob) + sum_terms(ijk,1,1)
+    enddo
+    ! update rotation term with Euler scheme
+    if(ROTATION_VAL) then
+      do ijk = 1,NGLLCUBE
+        A_array_rotation(ijk,1,1,ispec) = A_array_rotation(ijk,1,1,ispec) + source_euler_A(ijk,1,1)
+        B_array_rotation(ijk,1,1,ispec) = B_array_rotation(ijk,1,1,ispec) + source_euler_B(ijk,1,1)
+      enddo
+    endif
+#else
+    ! sum contributions from each element to the global mesh and add gravity term
+    do k=1,NGLLZ
+      do j=1,NGLLY
+        do i=1,NGLLX
+          sum_terms = - (wgllwgll_yz(j,k)*newtempx1(i,j,k) &
+                       + wgllwgll_xz(i,k)*newtempx2(i,j,k) &
+                       + wgllwgll_xy(i,j)*newtempx3(i,j,k))
+
+          if(GRAVITY_VAL) sum_terms = sum_terms + gravity_term(i,j,k)
+
+          iglob = ibool(i,j,k,ispec)
+          accelfluid(iglob) = accelfluid(iglob) + sum_terms
+        enddo
+      enddo
+    enddo
+    ! update rotation term with Euler scheme
+    if(ROTATION_VAL) then
+      ! use the source saved above
+      A_array_rotation(:,:,:,ispec) = A_array_rotation(:,:,:,ispec) + source_euler_A(:,:,:)
+      B_array_rotation(:,:,:,ispec) = B_array_rotation(:,:,:,ispec) + source_euler_B(:,:,:)
+    endif
+#endif
+
+  enddo   ! spectral element loop
+
+  end subroutine compute_forces_outer_core_Dev
+
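The move of this routine from a .f90 to a .F90 file is what makes the
FORCE_VECTORIZATION blocks effective: by the usual compiler convention, only
uppercase .F90 sources are run through the C preprocessor. A minimal sketch of
the gating pattern (the define would typically come from the build system,
e.g. a -DFORCE_VECTORIZATION flag; illustrative only):

#ifdef FORCE_VECTORIZATION
  ! collapsed single-index loops over all NGLLCUBE points of an element
#else
  ! classic triple i,j,k loops over NGLLX, NGLLY and NGLLZ
#endif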

Deleted: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -1,423 +0,0 @@
-!=====================================================================
-!
-!          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0
-!          --------------------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!                        Princeton University, USA
-!             and CNRS / INRIA / University of Pau, France
-! (c) Princeton University and CNRS / INRIA / University of Pau
-!                            August 2013
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-  subroutine compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
-                                          NSPEC,NGLOB, &
-                                          A_array_rotation,B_array_rotation, &
-                                          displfluid,accelfluid, &
-                                          div_displfluid,phase_is_inner)
-
-! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
-
-  use constants_solver
-
-  use specfem_par,only: &
-    hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-    minus_rho_g_over_kappa_fluid,d_ln_density_dr_table, &
-    MOVIE_VOLUME
-
-  use specfem_par_outercore,only: &
-    xstore => xstore_outer_core,ystore => ystore_outer_core,zstore => zstore_outer_core, &
-    xix => xix_outer_core,xiy => xiy_outer_core,xiz => xiz_outer_core, &
-    etax => etax_outer_core,etay => etay_outer_core,etaz => etaz_outer_core, &
-    gammax => gammax_outer_core,gammay => gammay_outer_core,gammaz => gammaz_outer_core, &
-    ibool => ibool_outer_core, &
-    phase_ispec_inner => phase_ispec_inner_outer_core, &
-    nspec_outer => nspec_outer_outer_core, &
-    nspec_inner => nspec_inner_outer_core
-
-  implicit none
-
-  integer :: NSPEC,NGLOB
-
-  ! for the Euler scheme for rotation
-  real(kind=CUSTOM_REAL) time,deltat,two_omega_earth
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: &
-    A_array_rotation,B_array_rotation
-
-  ! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NGLOB) :: displfluid,accelfluid
-
-  ! divergence of displacement
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: div_displfluid
-
-  ! inner/outer element run flag
-  logical :: phase_is_inner
-
-  ! local parameters
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
-  ! for gravity
-  integer :: int_radius
-  double precision :: radius,theta,phi,gxl,gyl,gzl
-  double precision :: cos_theta,sin_theta,cos_phi,sin_phi
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: gravity_term
-  ! for the Euler scheme for rotation
-  real(kind=CUSTOM_REAL) :: two_omega_deltat,cos_two_omega_t,sin_two_omega_t,A_rotation,B_rotation, &
-       ux_rotation,uy_rotation,dpotentialdx_with_rot,dpotentialdy_with_rot
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: source_euler_A,source_euler_B
-
-  integer :: ispec,iglob
-  integer :: i,j,k
-  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) :: dpotentialdxl,dpotentialdyl,dpotentialdzl
-  real(kind=CUSTOM_REAL) :: sum_terms
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points
-  real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points
-
-  equivalence(dummyx_loc,B1_m1_m2_5points)
-  equivalence(tempx1,C1_m1_m2_5points)
-  equivalence(newtempx1,E1_m1_m2_5points)
-
-  real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: A1_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: C1_mxm_m2_m1_5points
-  real(kind=CUSTOM_REAL), dimension(m2,m1) :: E1_mxm_m2_m1_5points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_5points)
-  equivalence(tempx3,C1_mxm_m2_m1_5points)
-  equivalence(newtempx3,E1_mxm_m2_m1_5points)
-
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: temp_gxl,temp_gyl,temp_gzl
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-    displ_times_grad_x_ln_rho,displ_times_grad_y_ln_rho,displ_times_grad_z_ln_rho
-
-! integer :: computed_elements
-  integer :: num_elements,ispec_p
-  integer :: iphase
-
-! ****************************************************
-!   big loop over all spectral elements in the fluid
-! ****************************************************
-
-  if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. (.not. phase_is_inner) ) div_displfluid(:,:,:,:) = 0._CUSTOM_REAL
-
-! computed_elements = 0
-  if( .not. phase_is_inner ) then
-    iphase = 1
-    num_elements = nspec_outer
-  else
-    iphase = 2
-    num_elements = nspec_inner
-  endif
-
-  do ispec_p = 1,num_elements
-
-    ispec = phase_ispec_inner(ispec_p,iphase)
-
-    ! only compute elements which belong to the current phase (inner or outer elements)
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          iglob = ibool(i,j,k,ispec)
-
-          ! stores "displacement"
-          dummyx_loc(i,j,k) = displfluid(iglob)
-
-          ! pre-computes factors
-          ! use mesh coordinates to get theta and phi
-          ! x y z contain r theta phi
-          radius = dble(xstore(iglob))
-          theta = dble(ystore(iglob))
-          phi = dble(zstore(iglob))
-
-          cos_theta = dcos(theta)
-          sin_theta = dsin(theta)
-          cos_phi = dcos(phi)
-          sin_phi = dsin(phi)
-
-          int_radius = nint(radius * R_EARTH_KM * 10.d0)
-
-          if( .not. GRAVITY_VAL ) then
-            ! chi times grad(rho)/rho = chi grad(ln rho) in Cartesian components
-            displ_times_grad_x_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
-                  * sngl(sin_theta * cos_phi * d_ln_density_dr_table(int_radius))
-            displ_times_grad_y_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
-                  * sngl(sin_theta * sin_phi * d_ln_density_dr_table(int_radius))
-            displ_times_grad_z_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
-                  * sngl(cos_theta * d_ln_density_dr_table(int_radius))
-          else
-            ! Cartesian components of the radial unit vector (direction of gravity),
-            ! combined below with minus_rho_g_over_kappa_fluid and the quadrature weights
-            temp_gxl(i,j,k) = sin_theta*cos_phi
-            temp_gyl(i,j,k) = sin_theta*sin_phi
-            temp_gzl(i,j,k) = cos_theta
-          endif
-
-        enddo
-      enddo
-    enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
-                                hprime_xx(i,2)*B1_m1_m2_5points(2,j) + &
-                                hprime_xx(i,3)*B1_m1_m2_5points(3,j) + &
-                                hprime_xx(i,4)*B1_m1_m2_5points(4,j) + &
-                                hprime_xx(i,5)*B1_m1_m2_5points(5,j)
-      enddo
-    enddo
-
-    do k = 1,NGLLX
-      do j=1,m1
-        do i=1,m1
-          tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                          dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                          dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                          dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                          dummyx_loc(i,5,k)*hprime_xxT(5,j)
-        enddo
-      enddo
-    enddo
-
-    do j=1,m1
-      do i=1,m2
-        C1_mxm_m2_m1_5points(i,j) = A1_mxm_m2_m1_5points(i,1)*hprime_xxT(1,j) + &
-                                    A1_mxm_m2_m1_5points(i,2)*hprime_xxT(2,j) + &
-                                    A1_mxm_m2_m1_5points(i,3)*hprime_xxT(3,j) + &
-                                    A1_mxm_m2_m1_5points(i,4)*hprime_xxT(4,j) + &
-                                    A1_mxm_m2_m1_5points(i,5)*hprime_xxT(5,j)
-      enddo
-    enddo
-
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          ! get derivatives of velocity potential with respect to x, y and z
-          xixl = xix(i,j,k,ispec)
-          xiyl = xiy(i,j,k,ispec)
-          xizl = xiz(i,j,k,ispec)
-          etaxl = etax(i,j,k,ispec)
-          etayl = etay(i,j,k,ispec)
-          etazl = etaz(i,j,k,ispec)
-          gammaxl = gammax(i,j,k,ispec)
-          gammayl = gammay(i,j,k,ispec)
-          gammazl = gammaz(i,j,k,ispec)
-
-          ! compute the jacobian
-          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                        + xizl*(etaxl*gammayl-etayl*gammaxl))
-
-          dpotentialdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-          dpotentialdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-          dpotentialdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-          ! compute contribution of rotation and add to gradient of potential
-          ! this term has no Z component
-          if(ROTATION_VAL) then
-
-            ! store the source for the Euler scheme for A_rotation and B_rotation
-            two_omega_deltat = deltat * two_omega_earth
-
-            cos_two_omega_t = cos(two_omega_earth*time)
-            sin_two_omega_t = sin(two_omega_earth*time)
-
-            ! time step deltat of Euler scheme is included in the source
-            source_euler_A(i,j,k) = two_omega_deltat &
-                  * (cos_two_omega_t * dpotentialdyl + sin_two_omega_t * dpotentialdxl)
-            source_euler_B(i,j,k) = two_omega_deltat &
-                  * (sin_two_omega_t * dpotentialdyl - cos_two_omega_t * dpotentialdxl)
-
-            A_rotation = A_array_rotation(i,j,k,ispec)
-            B_rotation = B_array_rotation(i,j,k,ispec)
-
-            ux_rotation =   A_rotation*cos_two_omega_t + B_rotation*sin_two_omega_t
-            uy_rotation = - A_rotation*sin_two_omega_t + B_rotation*cos_two_omega_t
-
-            dpotentialdx_with_rot = dpotentialdxl + ux_rotation
-            dpotentialdy_with_rot = dpotentialdyl + uy_rotation
-
-          else
-
-            dpotentialdx_with_rot = dpotentialdxl
-            dpotentialdy_with_rot = dpotentialdyl
-
-          endif  ! end of section with rotation
-
-          ! add the (chi/rho)*grad(rho) term in the no-gravity case
-          if(.not. GRAVITY_VAL) then
-
-            ! With regard to the non-gravitating case: we cannot set N^2 = 0 *and* let g = 0.
-            ! We can *either* assume N^2 = 0 but keep gravity g, *or* we can assume that gravity
-            ! is negligible to begin with, as in our GJI 2002a, in which case N does not arise.
-            ! We get:
-            !
-            ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
-            !
-            ! Then the displacement is
-            !
-            ! \bu = \bdel\chi+\chi\bdel\ln\rho = \rho^{-1}\bdel(\rho\chi)
-            !
-            ! and the pressure is
-            !
-            ! p = -\rho\ddot{\chi}
-            !
-            ! Thus in our 2002b GJI paper eqn (21) is wrong, and equation (41)
-            ! in our AGU monograph is incorrect; these equations should be replaced by
-            !
-            ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
-            !
-            ! Note that the fluid potential we use in GJI 2002a differs from the one used here:
-            !
-            ! \chi_GJI2002a = \rho\partial_t\chi
-            !
-            ! such that
-            !
-            ! \bv = \partial_t\bu=\rho^{-1}\bdel\chi_GJI2002a  (GJI 2002a eqn 20)
-            !
-            ! p = - \partial_t\chi_GJI2002a (GJI 2002a eqn 19)
-
-            ! use mesh coordinates to get theta and phi
-            ! x y z contain r theta phi
-            dpotentialdx_with_rot = dpotentialdx_with_rot + displ_times_grad_x_ln_rho(i,j,k)
-            dpotentialdy_with_rot = dpotentialdy_with_rot + displ_times_grad_y_ln_rho(i,j,k)
-            dpotentialdzl = dpotentialdzl + displ_times_grad_z_ln_rho(i,j,k)
-
-         else  ! if gravity is turned on
-
-            ! compute divergence of displacement
-            gxl = temp_gxl(i,j,k)
-            gyl = temp_gyl(i,j,k)
-            gzl = temp_gzl(i,j,k)
-
-            ! distinguish between single and double precision for reals
-            if(CUSTOM_REAL == SIZE_REAL) then
-              gravity_term(i,j,k) = &
-                      sngl( minus_rho_g_over_kappa_fluid(int_radius) &
-                      * dble(jacobianl) * wgll_cube(i,j,k) &
-                      * (dble(dpotentialdx_with_rot) * gxl  &
-                         + dble(dpotentialdy_with_rot) * gyl &
-                         + dble(dpotentialdzl) * gzl) )
-            else
-              gravity_term(i,j,k) = minus_rho_g_over_kappa_fluid(int_radius) * &
-                        jacobianl * wgll_cube(i,j,k) &
-                        * (dpotentialdx_with_rot * gxl  &
-                          + dpotentialdy_with_rot * gyl &
-                          + dpotentialdzl * gzl)
-            endif
-
-            ! divergence of displacement field with gravity on
-            ! note: these values are only used for SIMULATION_TYPE == 1 together with SAVE_FORWARD,
-            !          with MOVIE_VOLUME set to .true. and MOVIE_VOLUME_TYPE == 4;
-            !          for SIMULATION_TYPE == 3 they get overwritten by compute_kernels_outer_core()
-            if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. MOVIE_VOLUME) then
-              div_displfluid(i,j,k,ispec) =  &
-                        minus_rho_g_over_kappa_fluid(int_radius) &
-                        * (dpotentialdx_with_rot * gxl &
-                         + dpotentialdy_with_rot * gyl &
-                         + dpotentialdzl * gzl)
-            endif
-
-          endif
-
-          tempx1(i,j,k) = jacobianl*(xixl*dpotentialdx_with_rot &
-                                   + xiyl*dpotentialdy_with_rot + xizl*dpotentialdzl)
-          tempx2(i,j,k) = jacobianl*(etaxl*dpotentialdx_with_rot &
-                                   + etayl*dpotentialdy_with_rot + etazl*dpotentialdzl)
-          tempx3(i,j,k) = jacobianl*(gammaxl*dpotentialdx_with_rot &
-                                   + gammayl*dpotentialdy_with_rot + gammazl*dpotentialdzl)
-
-        enddo
-      enddo
-    enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-    do j=1,m2
-      do i=1,m1
-        E1_m1_m2_5points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_5points(1,j) + &
-                                hprimewgll_xxT(i,2)*C1_m1_m2_5points(2,j) + &
-                                hprimewgll_xxT(i,3)*C1_m1_m2_5points(3,j) + &
-                                hprimewgll_xxT(i,4)*C1_m1_m2_5points(4,j) + &
-                                hprimewgll_xxT(i,5)*C1_m1_m2_5points(5,j)
-      enddo
-    enddo
-
-    do k = 1,NGLLX
-      do j=1,m1
-        do i=1,m1
-          newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                             tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                             tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                             tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                             tempx2(i,5,k)*hprimewgll_xx(5,j)
-        enddo
-      enddo
-    enddo
-
-    do j=1,m1
-      do i=1,m2
-        E1_mxm_m2_m1_5points(i,j) = C1_mxm_m2_m1_5points(i,1)*hprimewgll_xx(1,j) + &
-                                    C1_mxm_m2_m1_5points(i,2)*hprimewgll_xx(2,j) + &
-                                    C1_mxm_m2_m1_5points(i,3)*hprimewgll_xx(3,j) + &
-                                    C1_mxm_m2_m1_5points(i,4)*hprimewgll_xx(4,j) + &
-                                    C1_mxm_m2_m1_5points(i,5)*hprimewgll_xx(5,j)
-      enddo
-    enddo
-
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-
-          ! sum contributions from each element to the global mesh and add gravity term
-          sum_terms = - (wgllwgll_yz(j,k)*newtempx1(i,j,k) &
-                       + wgllwgll_xz(i,k)*newtempx2(i,j,k) &
-                       + wgllwgll_xy(i,j)*newtempx3(i,j,k))
-
-          if(GRAVITY_VAL) sum_terms = sum_terms + gravity_term(i,j,k)
-
-          iglob = ibool(i,j,k,ispec)
-          accelfluid(iglob) = accelfluid(iglob) + sum_terms
-
-        enddo
-      enddo
-    enddo
-
-    ! update rotation term with Euler scheme
-    if(ROTATION_VAL) then
-      ! use the source saved above
-      A_array_rotation(:,:,:,ispec) = A_array_rotation(:,:,:,ispec) + source_euler_A(:,:,:)
-      B_array_rotation(:,:,:,ispec) = B_array_rotation(:,:,:,ispec) + source_euler_B(:,:,:)
-    endif
-
-  enddo   ! spectral element loop
-
-  end subroutine compute_forces_outer_core_Dev
-

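A note on two numerical techniques used throughout compute_forces_outer_core_Dev above. First, the equivalence statements and fully unrolled loops implement the Deville, Fischer and Mund (2002) matrix-product trick: each (NGLLX,NGLLY,NGLLZ) = (5,5,5) field is viewed as a 5x25 (or 25x5) matrix, so the derivative along one local direction becomes a single small dense matrix product. A minimal, self-contained sketch of the xi-derivative product (names such as "field" and "dfield_dxi" are hypothetical; the routine above avoids the reshape copies by aliasing the arrays through equivalence):

  program deville_mxm_sketch
    implicit none
    integer, parameter :: m1 = 5, m2 = 25     ! NGLLX and NGLLY*NGLLZ
    real :: hprime_xx(m1,m1)                  ! 1D GLL derivative matrix
    real :: field(m1,m1,m1)                   ! scalar field on the GLL points of one element
    real :: dfield_dxi(m1,m1,m1)              ! its derivative along the local xi direction
    real :: B(m1,m2), C(m1,m2)
    integer :: i,j

    call random_number(hprime_xx)
    call random_number(field)

    ! xi is the fastest-varying index, so differentiating all 25 (eta,gamma)
    ! columns at once is a single 5x5 times 5x25 matrix product
    B = reshape(field, (/ m1, m2 /))
    do j = 1,m2
      do i = 1,m1
        C(i,j) = hprime_xx(i,1)*B(1,j) + hprime_xx(i,2)*B(2,j) &
               + hprime_xx(i,3)*B(3,j) + hprime_xx(i,4)*B(4,j) &
               + hprime_xx(i,5)*B(5,j)
      enddo
    enddo
    dfield_dxi = reshape(C, (/ m1, m1, m1 /))

    print *, 'd(field)/d(xi) at GLL point (1,1,1): ', dfield_dxi(1,1,1)
  end program deville_mxm_sketch

Second, the ROTATION_VAL branch is a forward Euler update of the two rotation memory fields A_array_rotation and B_array_rotation. A scalar sketch of the same update for a single GLL point, with frozen potential gradients and hypothetical names (the Earth value of two_omega is approximately 2 x 7.292e-5 rad/s):

  program euler_rotation_sketch
    implicit none
    real :: A_rot, B_rot, two_omega, deltat, t
    real :: dchi_dx, dchi_dy, cos2wt, sin2wt, ux_rot, uy_rot
    integer :: it

    two_omega = 2.0 * 7.292e-5      ! approximate 2*Omega for the Earth (rad/s)
    deltat = 0.1
    A_rot = 0. ; B_rot = 0.
    dchi_dx = 1.0 ; dchi_dy = 0.5   ! stand-ins for dpotentialdxl, dpotentialdyl

    do it = 1,10
      t = (it-1) * deltat
      cos2wt = cos(two_omega * t)
      sin2wt = sin(two_omega * t)
      ! rotation contribution to the potential gradient (no z component)
      ux_rot =   A_rot*cos2wt + B_rot*sin2wt
      uy_rot = - A_rot*sin2wt + B_rot*cos2wt
      ! Euler step: the time step deltat is folded into the source terms,
      ! exactly as in source_euler_A and source_euler_B above
      A_rot = A_rot + deltat*two_omega * (cos2wt*dchi_dy + sin2wt*dchi_dx)
      B_rot = B_rot + deltat*two_omega * (sin2wt*dchi_dy - cos2wt*dchi_dx)
    enddo
    print *, 'rotation contributions at final step: ', ux_rot, uy_rot
  end program euler_rotation_sketch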
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -25,12 +25,15 @@
 !
 !=====================================================================
 
+
   subroutine compute_forces_viscoelastic()
 
+! elastic domains for forward or adjoint simulations (SIMULATION_TYPE == 1 or 2)
+
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
-  use specfem_par_outercore,only: accel_outer_core,b_accel_outer_core, &
+  use specfem_par_outercore,only: accel_outer_core, &
                                   normal_top_outer_core,jacobian2D_top_outer_core, &
                                   normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
                                   ibelm_top_outer_core,ibelm_bottom_outer_core, &
@@ -45,41 +48,9 @@
   integer :: iphase
   logical :: phase_is_inner
 
+  ! checks
+  if( SIMULATION_TYPE == 3 ) return
 
-!daniel debug: att - debug
-!  integer :: iglob
-!  logical,parameter :: DEBUG = .false.
-!  if( DEBUG ) then
-!    iglob = ibool_crust_mantle(1,1,1,100)
-!    if( SIMULATION_TYPE == 1) then
-!      if( it == NSTEP .and. myrank == 0 ) then
-!        print*,'last step',it,'Rxx:',R_xx_crust_mantle(1,1,1,1,100),epsilondev_xx_crust_mantle(1,1,1,100), &
-!          displ_crust_mantle(1,iglob),displ_crust_mantle(2,iglob),displ_crust_mantle(3,iglob)
-!      endif
-!      if( it == NSTEP-1 .and. myrank == 0 ) then
-!        print*,'second last step',it,'Rxx:',R_xx_crust_mantle(1,1,1,1,100),epsilondev_xx_crust_mantle(1,1,1,100), &
-!          displ_crust_mantle(1,iglob),displ_crust_mantle(2,iglob),displ_crust_mantle(3,iglob)
-!      endif
-!      if( it == NSTEP-2 .and. myrank == 0 ) then
-!        print*,'third last step',it,'Rxx:',R_xx_crust_mantle(1,1,1,1,100),epsilondev_xx_crust_mantle(1,1,1,100), &
-!          displ_crust_mantle(1,iglob),displ_crust_mantle(2,iglob),displ_crust_mantle(3,iglob)
-!      endif
-!    else if( SIMULATION_TYPE == 3 ) then
-!      if( it == 1 .and. myrank == 0 ) then
-!        print*,'first step',it,'Rxx:',b_R_xx_crust_mantle(1,1,1,1,100),b_epsilondev_xx_crust_mantle(1,1,1,100), &
-!          b_displ_crust_mantle(1,iglob),b_displ_crust_mantle(2,iglob),b_displ_crust_mantle(3,iglob)
-!      endif
-!      if( it == 2 .and. myrank == 0 ) then
-!        print*,'second step',it,'Rxx:',b_R_xx_crust_mantle(1,1,1,1,100),b_epsilondev_xx_crust_mantle(1,1,1,100), &
-!          b_displ_crust_mantle(1,iglob),b_displ_crust_mantle(2,iglob),b_displ_crust_mantle(3,iglob)
-!      endif
-!      if( it == 3 .and. myrank == 0 ) then
-!        print*,'third step',it,'Rxx:',b_R_xx_crust_mantle(1,1,1,1,100),b_epsilondev_xx_crust_mantle(1,1,1,100), &
-!          b_displ_crust_mantle(1,iglob),b_displ_crust_mantle(2,iglob),b_displ_crust_mantle(3,iglob)
-!      endif
-!    endif
-!  endif
-
   ! ****************************************************
   !   big loop over all spectral elements in the solid
   ! ****************************************************
@@ -102,12 +73,10 @@
       phase_is_inner = .true.
     endif
 
-
+    ! compute internal forces in the solid regions
+    ! note: for anisotropy and gravity, x y and z contain r theta and phi
     if( .NOT. GPU_MODE ) then
        ! on CPU
-
-       ! compute internal forces in the solid regions
-       ! note: for anisotropy and gravity, x y and z contain r theta and phi
        if( USE_DEVILLE_PRODUCTS_VAL ) then
           ! uses Deville (2002) optimizations
           ! crust/mantle region
@@ -123,7 +92,6 @@
                eps_trace_over_3_crust_mantle, &
                alphaval,betaval,gammaval, &
                factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
-
           ! inner core region
           call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
                NSPEC_INNER_CORE_ATTENUATION, &
@@ -136,7 +104,6 @@
                eps_trace_over_3_inner_core,&
                alphaval,betaval,gammaval, &
                factor_common_inner_core,size(factor_common_inner_core,5), .false. )
-
        else
           ! no Deville optimization
           ! crust/mantle region
@@ -166,72 +133,6 @@
                factor_common_inner_core,size(factor_common_inner_core,5) )
 
        endif
-
-       ! adjoint / kernel runs
-       if (SIMULATION_TYPE == 3 ) then
-          if( USE_DEVILLE_PRODUCTS_VAL ) then
-             ! uses Deville (2002) optimizations
-             ! crust/mantle region
-             call compute_forces_crust_mantle_Dev( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
-                  NSPEC_CRUST_MANTLE_STR_AND_ATT, &
-                  b_deltat, &
-                  b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                  phase_is_inner, &
-                  b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
-                  b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
-                  b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
-                  b_epsilondev_xy_crust_mantle, &
-                  b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
-                  b_eps_trace_over_3_crust_mantle, &
-                  b_alphaval,b_betaval,b_gammaval, &
-                  factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
-             ! inner core region
-             call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
-                  NSPEC_INNER_CORE_STR_AND_ATT, &
-                  b_deltat, &
-                  b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-                  phase_is_inner, &
-                  b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
-                  b_R_xz_inner_core,b_R_yz_inner_core, &
-                  b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
-                  b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
-                  b_eps_trace_over_3_inner_core,&
-                  b_alphaval,b_betaval,b_gammaval, &
-                  factor_common_inner_core,size(factor_common_inner_core,5), .true. )
-
-          else
-             ! no Deville optimization
-             ! crust/mantle region
-             call compute_forces_crust_mantle( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
-                  NSPEC_CRUST_MANTLE_STR_AND_ATT, &
-                  b_deltat, &
-                  b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                  phase_is_inner, &
-                  b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
-                  b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
-                  b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
-                  b_epsilondev_xy_crust_mantle, &
-                  b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
-                  b_eps_trace_over_3_crust_mantle, &
-                  b_alphaval,b_betaval,b_gammaval, &
-                  factor_common_crust_mantle,size(factor_common_crust_mantle,5) )
-
-             ! inner core region
-             call compute_forces_inner_core( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
-                  NSPEC_INNER_CORE_STR_AND_ATT, &
-                  b_deltat, &
-                  b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-                  phase_is_inner, &
-                  b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
-                  b_R_xz_inner_core,b_R_yz_inner_core, &
-                  b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
-                  b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
-                  b_eps_trace_over_3_inner_core,&
-                  b_alphaval,b_betaval,b_gammaval, &
-                  factor_common_inner_core,size(factor_common_inner_core,5) )
-          endif
-       endif !SIMULATION_TYPE == 3
-
     else
        ! on GPU
        ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
@@ -246,12 +147,12 @@
 
        ! absorbing boundaries
        ! Stacey
-       if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_crust_mantle()
+       if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_crust_mantle_forward()
 
        ! add the sources
 
        ! add adjoint sources
-       if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+       if (SIMULATION_TYPE == 2 ) then
           if( nadj_rec_local > 0 ) call compute_add_sources_adjoint()
        endif
 
@@ -262,9 +163,6 @@
           ! adds sources for forward simulation
           if (SIMULATION_TYPE == 1 .and. nsources_local > 0) &
             call compute_add_sources()
-          ! add sources for backward/reconstructed wavefield
-          if (SIMULATION_TYPE == 3 .and. nsources_local > 0) &
-            call compute_add_sources_backward()
 
        case( 1 )
           ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
@@ -282,14 +180,6 @@
           ! note the ensemble forward sources are generally distributed on the surface of the earth
           ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
           ! therefore, we must add it here, before applying the inverse of mass matrix
-
-       case( 3 )
-          ! third step of noise tomography, i.e., read the surface movie saved at every timestep
-          ! use the movie to reconstruct the ensemble forward wavefield
-          ! the ensemble adjoint wavefield is done as usual
-          ! note: instead of "NSTEP-it+1" we now use "it", since reconstruction is a reversal of a reversal
-          call noise_read_add_surface_movie(NGLOB_CRUST_MANTLE_ADJOINT,b_accel_crust_mantle,it)
-
        end select
 
        ! ****************************************************
@@ -302,27 +192,25 @@
           !--- couple with outer core at the bottom of the mantle
           !---
           if(ACTUALLY_COUPLE_FLUID_CMB) &
-            call compute_coupling_CMB_fluid(displ_crust_mantle,b_displ_crust_mantle, &
-                                           accel_crust_mantle,b_accel_crust_mantle, &
-                                           ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-                                           accel_outer_core,b_accel_outer_core, &
-                                           normal_top_outer_core,jacobian2D_top_outer_core, &
-                                           wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-                                           RHO_TOP_OC,minus_g_cmb, &
-                                           SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE))
+            call compute_coupling_CMB_fluid(displ_crust_mantle,accel_crust_mantle, &
+                                            ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                                            accel_outer_core, &
+                                            normal_top_outer_core,jacobian2D_top_outer_core, &
+                                            wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                                            RHO_TOP_OC,minus_g_cmb, &
+                                            NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE))
 
           !---
           !--- couple with outer core at the top of the inner core
           !---
           if(ACTUALLY_COUPLE_FLUID_ICB) &
-            call compute_coupling_ICB_fluid(displ_inner_core,b_displ_inner_core, &
-                                           accel_inner_core,b_accel_inner_core, &
-                                           ibool_inner_core,ibelm_top_inner_core,  &
-                                           accel_outer_core,b_accel_outer_core, &
-                                           normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
-                                           wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
-                                           RHO_BOTTOM_OC,minus_g_icb, &
-                                           SIMULATION_TYPE,NSPEC2D_TOP(IREGION_INNER_CORE))
+            call compute_coupling_ICB_fluid(displ_inner_core,accel_inner_core, &
+                                            ibool_inner_core,ibelm_top_inner_core,  &
+                                            accel_outer_core, &
+                                            normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                            wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                                            RHO_BOTTOM_OC,minus_g_icb, &
+                                            NSPEC2D_TOP(IREGION_INNER_CORE))
 
        else
           ! on GPU
@@ -330,12 +218,12 @@
           !--- couple with outer core at the bottom of the mantle
           !---
           if( ACTUALLY_COUPLE_FLUID_CMB ) &
-               call compute_coupling_cmb_fluid_cuda(Mesh_pointer)
+               call compute_coupling_cmb_fluid_cuda(Mesh_pointer,1)
           !---
           !--- couple with outer core at the top of the inner core
           !---
           if( ACTUALLY_COUPLE_FLUID_ICB ) &
-               call compute_coupling_icb_fluid_cuda(Mesh_pointer)
+               call compute_coupling_icb_fluid_cuda(Mesh_pointer,1)
 
        endif
     endif ! iphase == 1
@@ -387,51 +275,6 @@
                       IREGION_INNER_CORE, &
                       1)
       endif ! GPU_MODE
-
-      ! adjoint / kernel runs
-      if (SIMULATION_TYPE == 3) then
-        if(.NOT. GPU_MODE) then
-          ! on CPU
-          ! sends accel values to corresponding MPI interface neighbors
-          ! crust mantle
-          call assemble_MPI_vector_s(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
-                        b_accel_crust_mantle, &
-                        b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
-                        num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
-                        nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,&
-                        my_neighbours_crust_mantle, &
-                        b_request_send_vector_cm,b_request_recv_vector_cm)
-          ! inner core
-          call assemble_MPI_vector_s(NPROCTOT_VAL,NGLOB_INNER_CORE, &
-                        b_accel_inner_core, &
-                        b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
-                        num_interfaces_inner_core,max_nibool_interfaces_ic, &
-                        nibool_interfaces_inner_core,ibool_interfaces_inner_core,&
-                        my_neighbours_inner_core, &
-                        b_request_send_vector_ic,b_request_recv_vector_ic)
-        else
-          ! on GPU
-          ! crust mantle
-          call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                      b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
-                      num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
-                      nibool_interfaces_crust_mantle,&
-                      my_neighbours_crust_mantle, &
-                      b_request_send_vector_cm,b_request_recv_vector_cm, &
-                      IREGION_CRUST_MANTLE, &
-                      3) ! <-- 3 == adjoint b_accel
-          ! inner core
-          call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                      b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
-                      num_interfaces_inner_core,max_nibool_interfaces_ic, &
-                      nibool_interfaces_inner_core,&
-                      my_neighbours_inner_core, &
-                      b_request_send_vector_ic,b_request_recv_vector_ic, &
-                      IREGION_INNER_CORE, &
-                      3)
-        endif ! GPU
-      endif ! SIMULATION_TYPE == 3
-
     else
       ! waits for send/receive requests to be completed and assembles values
       if(.NOT. GPU_MODE) then
@@ -467,46 +310,6 @@
                             IREGION_INNER_CORE, &
                             1)
       endif
-
-
-      ! adjoint / kernel runs
-      if (SIMULATION_TYPE == 3) then
-        ! waits for send/receive requests to be completed and assembles values
-        if(.NOT. GPU_MODE) then
-          ! on CPU
-          ! crust mantle
-          call assemble_MPI_vector_w(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
-                              b_accel_crust_mantle, &
-                              b_buffer_recv_vector_cm,num_interfaces_crust_mantle,&
-                              max_nibool_interfaces_cm, &
-                              nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle, &
-                              b_request_send_vector_cm,b_request_recv_vector_cm)
-          ! inner core
-          call assemble_MPI_vector_w(NPROCTOT_VAL,NGLOB_INNER_CORE, &
-                              b_accel_inner_core, &
-                              b_buffer_recv_vector_inner_core,num_interfaces_inner_core,&
-                              max_nibool_interfaces_ic, &
-                              nibool_interfaces_inner_core,ibool_interfaces_inner_core, &
-                              b_request_send_vector_ic,b_request_recv_vector_ic)
-
-        else
-          ! on GPU
-          ! crust mantle
-          call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                            b_buffer_recv_vector_cm, &
-                            num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
-                            b_request_send_vector_cm,b_request_recv_vector_cm, &
-                            IREGION_CRUST_MANTLE, &
-                            3) ! <-- 3 == adjoint b_accel
-          ! inner core
-          call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL,&
-                            b_buffer_recv_vector_inner_core, &
-                            num_interfaces_inner_core,max_nibool_interfaces_ic, &
-                            b_request_send_vector_ic,b_request_recv_vector_ic, &
-                            IREGION_INNER_CORE, &
-                            3)
-        endif
-      endif ! SIMULATION_TYPE == 3
     endif ! iphase == 1
 
   enddo ! iphase
@@ -515,19 +318,11 @@
   if(.NOT. GPU_MODE) then
      ! on CPU
      call update_accel_elastic(NGLOB_CRUST_MANTLE,NGLOB_XY_CM,veloc_crust_mantle,accel_crust_mantle, &
-                               two_omega_earth, &
-                               rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle, &
-                               NCHUNKS_VAL,ABSORBING_CONDITIONS)
-     ! adjoint / kernel runs
-     if (SIMULATION_TYPE == 3) &
-          call update_accel_elastic(NGLOB_CRUST_MANTLE_ADJOINT,NGLOB_XY_CM,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                                    b_two_omega_earth, &
-                                    rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle, &
-                                    NCHUNKS_VAL,ABSORBING_CONDITIONS)
+                               two_omega_earth,rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle)
   else
      ! on GPU
-     call kernel_3_a_cuda(Mesh_pointer, &
-                         deltatover2,SIMULATION_TYPE,b_deltatover2,NCHUNKS_VAL)
+     ! includes FORWARD_OR_ADJOINT == 1
+     call update_accel_3_a_cuda(Mesh_pointer,deltatover2,NCHUNKS_VAL,1)
   endif
 
   ! couples ocean with crust mantle
@@ -540,9 +335,7 @@
                                   rmass_ocean_load,normal_top_crust_mantle, &
                                   ibool_crust_mantle,ibelm_top_crust_mantle, &
                                   updated_dof_ocean_load,NGLOB_XY_CM, &
-                                  NSPEC2D_TOP(IREGION_CRUST_MANTLE), &
-                                  ABSORBING_CONDITIONS,EXACT_MASS_MATRIX_FOR_ROTATION,USE_LDDRK)
-
+                                  NSPEC2D_TOP(IREGION_CRUST_MANTLE) )
     else
       ! on GPU
       call compute_coupling_ocean_cuda(Mesh_pointer,NCHUNKS_VAL,EXACT_MASS_MATRIX_FOR_ROTATION,USE_LDDRK, &
@@ -558,15 +351,376 @@
     call update_veloc_elastic(NGLOB_CRUST_MANTLE,veloc_crust_mantle,accel_crust_mantle, &
                               NGLOB_INNER_CORE,veloc_inner_core,accel_inner_core, &
                               deltatover2,two_omega_earth,rmass_inner_core)
+  else
+    ! on GPU
+    ! includes FORWARD_OR_ADJOINT == 1
+    call update_veloc_3_b_cuda(Mesh_pointer,deltatover2,1)
+  endif
+
+  end subroutine compute_forces_viscoelastic
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine compute_forces_viscoelastic_backward()
+
+! backward/reconstructed wavefields only
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore,only: b_accel_outer_core, &
+                                  normal_top_outer_core,jacobian2D_top_outer_core, &
+                                  normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                  ibelm_top_outer_core,ibelm_bottom_outer_core, &
+                                  ibool_outer_core
+  use specfem_par_movie
+  implicit none
+
+  ! local parameters
+  ! non blocking MPI
+  ! iphase: iphase = 1 is for computing outer elements in the crust_mantle and inner_core regions,
+  !              iphase = 2 is for computing inner elements (former icall parameter)
+  integer :: iphase
+  logical :: phase_is_inner
+
+  ! checks
+  if( SIMULATION_TYPE /= 3 ) return
+
+!daniel debug: att - debug
+!  integer :: iglob
+!  logical,parameter :: DEBUG = .false.
+!  if( DEBUG ) then
+!    iglob = ibool_crust_mantle(1,1,1,100)
+!    if( SIMULATION_TYPE == 1) then
+!      if( it == NSTEP .and. myrank == 0 ) then
+!        print*,'last step',it,'Rxx:',R_xx_crust_mantle(1,1,1,1,100),epsilondev_xx_crust_mantle(1,1,1,100), &
+!          displ_crust_mantle(1,iglob),displ_crust_mantle(2,iglob),displ_crust_mantle(3,iglob)
+!      endif
+!      if( it == NSTEP-1 .and. myrank == 0 ) then
+!        print*,'second last step',it,'Rxx:',R_xx_crust_mantle(1,1,1,1,100),epsilondev_xx_crust_mantle(1,1,1,100), &
+!          displ_crust_mantle(1,iglob),displ_crust_mantle(2,iglob),displ_crust_mantle(3,iglob)
+!      endif
+!      if( it == NSTEP-2 .and. myrank == 0 ) then
+!        print*,'third last step',it,'Rxx:',R_xx_crust_mantle(1,1,1,1,100),epsilondev_xx_crust_mantle(1,1,1,100), &
+!          displ_crust_mantle(1,iglob),displ_crust_mantle(2,iglob),displ_crust_mantle(3,iglob)
+!      endif
+!    else if( SIMULATION_TYPE == 3 ) then
+!      if( it == 1 .and. myrank == 0 ) then
+!        print*,'first step',it,'Rxx:',b_R_xx_crust_mantle(1,1,1,1,100),b_epsilondev_xx_crust_mantle(1,1,1,100), &
+!          b_displ_crust_mantle(1,iglob),b_displ_crust_mantle(2,iglob),b_displ_crust_mantle(3,iglob)
+!      endif
+!      if( it == 2 .and. myrank == 0 ) then
+!        print*,'second step',it,'Rxx:',b_R_xx_crust_mantle(1,1,1,1,100),b_epsilondev_xx_crust_mantle(1,1,1,100), &
+!          b_displ_crust_mantle(1,iglob),b_displ_crust_mantle(2,iglob),b_displ_crust_mantle(3,iglob)
+!      endif
+!      if( it == 3 .and. myrank == 0 ) then
+!        print*,'third step',it,'Rxx:',b_R_xx_crust_mantle(1,1,1,1,100),b_epsilondev_xx_crust_mantle(1,1,1,100), &
+!          b_displ_crust_mantle(1,iglob),b_displ_crust_mantle(2,iglob),b_displ_crust_mantle(3,iglob)
+!      endif
+!    endif
+!  endif
+
+  ! ****************************************************
+  !   big loop over all spectral elements in the solid
+  ! ****************************************************
+
+  ! compute internal forces in the solid regions
+
+  ! for anisotropy and gravity, x y and z contain r theta and phi
+
+  ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
+  do iphase = 1,2
+
+    ! first, iphase == 1 for points on MPI interfaces (thus outer elements)
+    ! second, iphase == 2 for points purely inside the partition (thus inner elements)
+    !
+    ! computes all the outer elements first, then sends out non-blocking MPI communication
+    ! and continues computing inner elements (overlapping communication with computation)
+    if( iphase == 1 ) then
+      phase_is_inner = .false.
+    else
+      phase_is_inner = .true.
+    endif
+
+    ! compute internal forces in the solid regions
+    ! note: for anisotropy and gravity, x y and z contain r theta and phi
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      ! adjoint / kernel runs
+      if( USE_DEVILLE_PRODUCTS_VAL ) then
+        ! uses Deville (2002) optimizations
+        ! crust/mantle region
+        call compute_forces_crust_mantle_Dev( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
+              NSPEC_CRUST_MANTLE_STR_AND_ATT, &
+              b_deltat, &
+              b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+              phase_is_inner, &
+              b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+              b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+              b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+              b_epsilondev_xy_crust_mantle, &
+              b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+              b_eps_trace_over_3_crust_mantle, &
+              b_alphaval,b_betaval,b_gammaval, &
+              factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
+        ! inner core region
+        call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
+              NSPEC_INNER_CORE_STR_AND_ATT, &
+              b_deltat, &
+              b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+              phase_is_inner, &
+              b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+              b_R_xz_inner_core,b_R_yz_inner_core, &
+              b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+              b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+              b_eps_trace_over_3_inner_core,&
+              b_alphaval,b_betaval,b_gammaval, &
+              factor_common_inner_core,size(factor_common_inner_core,5), .true. )
+
+      else
+        ! no Deville optimization
+        ! crust/mantle region
+        call compute_forces_crust_mantle( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
+              NSPEC_CRUST_MANTLE_STR_AND_ATT, &
+              b_deltat, &
+              b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+              phase_is_inner, &
+              b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+              b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+              b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+              b_epsilondev_xy_crust_mantle, &
+              b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+              b_eps_trace_over_3_crust_mantle, &
+              b_alphaval,b_betaval,b_gammaval, &
+              factor_common_crust_mantle,size(factor_common_crust_mantle,5) )
+
+        ! inner core region
+        call compute_forces_inner_core( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
+              NSPEC_INNER_CORE_STR_AND_ATT, &
+              b_deltat, &
+              b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+              phase_is_inner, &
+              b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+              b_R_xz_inner_core,b_R_yz_inner_core, &
+              b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+              b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+              b_eps_trace_over_3_inner_core,&
+              b_alphaval,b_betaval,b_gammaval, &
+              factor_common_inner_core,size(factor_common_inner_core,5) )
+      endif
+    else
+      ! on GPU
+      ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+      ! for crust/mantle
+      call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
+      ! for inner core
+      call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
+    endif ! GPU_MODE
+
+    ! computes additional contributions to acceleration field
+    if( iphase == 1 ) then
+
+       ! absorbing boundaries
+       ! Stacey
+       if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_crust_mantle_backward()
+
+       ! add the sources
+       select case( NOISE_TOMOGRAPHY )
+       case( 0 )
+          ! add sources for backward/reconstructed wavefield
+          if( nsources_local > 0 ) &
+            call compute_add_sources_backward()
+
+       case( 3 )
+          ! third step of noise tomography, i.e., read the surface movie saved at every timestep
+          ! use the movie to reconstruct the ensemble forward wavefield
+          ! the ensemble adjoint wavefield is done as usual
+          ! note: instead of "NSTEP-it+1" we now use "it", since reconstruction is a reversal of a reversal
+          call noise_read_add_surface_movie(NGLOB_CRUST_MANTLE_ADJOINT,b_accel_crust_mantle,it)
+
+       end select
+
+       ! ****************************************************
+       ! **********  add matching with fluid part  **********
+       ! ****************************************************
+       ! only for elements in first matching layer in the solid
+       if( .not. GPU_MODE ) then
+          ! on CPU
+          !---
+          !--- couple with outer core at the bottom of the mantle
+          !---
+          if(ACTUALLY_COUPLE_FLUID_CMB) &
+            call compute_coupling_CMB_fluid(b_displ_crust_mantle,b_accel_crust_mantle, &
+                                            ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                                            b_accel_outer_core, &
+                                            normal_top_outer_core,jacobian2D_top_outer_core, &
+                                            wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                                            RHO_TOP_OC,minus_g_cmb, &
+                                            NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE))
+
+          !---
+          !--- couple with outer core at the top of the inner core
+          !---
+          if(ACTUALLY_COUPLE_FLUID_ICB) &
+            call compute_coupling_ICB_fluid(b_displ_inner_core,b_accel_inner_core, &
+                                            ibool_inner_core,ibelm_top_inner_core,  &
+                                            b_accel_outer_core, &
+                                            normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                            wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                                            RHO_BOTTOM_OC,minus_g_icb, &
+                                            NSPEC2D_TOP(IREGION_INNER_CORE))
+
+       else
+          ! on GPU
+          !---
+          !--- couple with outer core at the bottom of the mantle
+          !---
+          if( ACTUALLY_COUPLE_FLUID_CMB ) &
+               call compute_coupling_cmb_fluid_cuda(Mesh_pointer,3)
+          !---
+          !--- couple with outer core at the top of the inner core
+          !---
+          if( ACTUALLY_COUPLE_FLUID_ICB ) &
+               call compute_coupling_icb_fluid_cuda(Mesh_pointer,3)
+
+       endif
+    endif ! iphase == 1
+
+    ! assemble all the contributions between slices using MPI
+
+    ! crust/mantle and inner core handled in the same call
+    ! in order to reduce the number of MPI messages by a factor of two
+
+    if( iphase == 1 ) then
+      ! sends out MPI interface data
+      ! adjoint / kernel runs
+      if(.NOT. GPU_MODE) then
+        ! on CPU
+        ! sends accel values to corresponding MPI interface neighbors
+        ! crust mantle
+        call assemble_MPI_vector_s(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
+                      b_accel_crust_mantle, &
+                      b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
+                      num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                      nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,&
+                      my_neighbours_crust_mantle, &
+                      b_request_send_vector_cm,b_request_recv_vector_cm)
+        ! inner core
+        call assemble_MPI_vector_s(NPROCTOT_VAL,NGLOB_INNER_CORE, &
+                      b_accel_inner_core, &
+                      b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+                      num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                      nibool_interfaces_inner_core,ibool_interfaces_inner_core,&
+                      my_neighbours_inner_core, &
+                      b_request_send_vector_ic,b_request_recv_vector_ic)
+      else
+        ! on GPU
+        ! crust mantle
+        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                    b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
+                    num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                    nibool_interfaces_crust_mantle,&
+                    my_neighbours_crust_mantle, &
+                    b_request_send_vector_cm,b_request_recv_vector_cm, &
+                    IREGION_CRUST_MANTLE, &
+                    3) ! <-- 3 == adjoint b_accel
+        ! inner core
+        call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                    b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+                    num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                    nibool_interfaces_inner_core,&
+                    my_neighbours_inner_core, &
+                    b_request_send_vector_ic,b_request_recv_vector_ic, &
+                    IREGION_INNER_CORE, &
+                    3)
+      endif ! GPU
+    else
+      ! adjoint / kernel runs:
+      ! waits for send/receive requests to be completed
+      ! and assembles values
+      if(.NOT. GPU_MODE) then
+        ! on CPU
+        ! crust mantle
+        call assemble_MPI_vector_w(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
+                            b_accel_crust_mantle, &
+                            b_buffer_recv_vector_cm,num_interfaces_crust_mantle,&
+                            max_nibool_interfaces_cm, &
+                            nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle, &
+                            b_request_send_vector_cm,b_request_recv_vector_cm)
+        ! inner core
+        call assemble_MPI_vector_w(NPROCTOT_VAL,NGLOB_INNER_CORE, &
+                            b_accel_inner_core, &
+                            b_buffer_recv_vector_inner_core,num_interfaces_inner_core,&
+                            max_nibool_interfaces_ic, &
+                            nibool_interfaces_inner_core,ibool_interfaces_inner_core, &
+                            b_request_send_vector_ic,b_request_recv_vector_ic)
+
+      else
+        ! on GPU
+        ! crust mantle
+        call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
+                          b_buffer_recv_vector_cm, &
+                          num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+                          b_request_send_vector_cm,b_request_recv_vector_cm, &
+                          IREGION_CRUST_MANTLE, &
+                          3) ! <-- 3 == adjoint b_accel
+        ! inner core
+        call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL,&
+                          b_buffer_recv_vector_inner_core, &
+                          num_interfaces_inner_core,max_nibool_interfaces_ic, &
+                          b_request_send_vector_ic,b_request_recv_vector_ic, &
+                          IREGION_INNER_CORE, &
+                          3)
+      endif
+    endif ! iphase == 1
+
+  enddo ! iphase
+
+  ! updates (only) acceleration w/ rotation in the crust/mantle region (touches oceans)
+  if(.NOT. GPU_MODE) then
+    ! on CPU
     ! adjoint / kernel runs
-    if (SIMULATION_TYPE == 3) &
-      call update_veloc_elastic(NGLOB_CRUST_MANTLE_ADJOINT,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                                NGLOB_INNER_CORE_ADJOINT,b_veloc_inner_core,b_accel_inner_core, &
-                                b_deltatover2,b_two_omega_earth,rmass_inner_core)
+    call update_accel_elastic(NGLOB_CRUST_MANTLE_ADJOINT,NGLOB_XY_CM,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                              b_two_omega_earth,rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle)
   else
+     ! on GPU
+     ! includes FORWARD_OR_ADJOINT == 3
+     call update_accel_3_a_cuda(Mesh_pointer,b_deltatover2,NCHUNKS_VAL,3)
+  endif
+
+  ! couples ocean with crust mantle
+  ! (updates acceleration with ocean load approximation)
+  if( OCEANS_VAL ) then
+    if(.NOT. GPU_MODE) then
+      ! on CPU
+      call compute_coupling_ocean(b_accel_crust_mantle, &
+                                  rmassx_crust_mantle, rmassy_crust_mantle, rmassz_crust_mantle, &
+                                  rmass_ocean_load,normal_top_crust_mantle, &
+                                  ibool_crust_mantle,ibelm_top_crust_mantle, &
+                                  updated_dof_ocean_load,NGLOB_XY_CM, &
+                                  NSPEC2D_TOP(IREGION_CRUST_MANTLE) )
+    else
+      ! on GPU
+      call compute_coupling_ocean_cuda(Mesh_pointer,NCHUNKS_VAL,EXACT_MASS_MATRIX_FOR_ROTATION,USE_LDDRK, &
+                                       3) ! <- 3 == backward/reconstructed arrays
+    endif
+  endif
+
+  ! Newmark time scheme:
+  ! corrector terms for elastic parts
+  ! (updates velocity)
+  if(.NOT. GPU_MODE ) then
+    ! on CPU
+    ! adjoint / kernel runs
+    call update_veloc_elastic(NGLOB_CRUST_MANTLE_ADJOINT,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                              NGLOB_INNER_CORE_ADJOINT,b_veloc_inner_core,b_accel_inner_core, &
+                              b_deltatover2,b_two_omega_earth,rmass_inner_core)
+  else
     ! on GPU
-    call kernel_3_b_cuda(Mesh_pointer, &
-                        deltatover2,SIMULATION_TYPE,b_deltatover2)
+    ! includes FORWARD_OR_ADJOINT == 3
+    call update_veloc_3_b_cuda(Mesh_pointer,b_deltatover2,3)
   endif
 
 
@@ -583,5 +737,5 @@
 !    endif
 !  endif
 
-  end subroutine compute_forces_viscoelastic
+  end subroutine compute_forces_viscoelastic_backward
 

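The net effect of the changes to this file: the old combined routine is split in two. compute_forces_viscoelastic now returns early for SIMULATION_TYPE == 3 and handles only the forward/adjoint wavefield, while the new compute_forces_viscoelastic_backward handles only the backward/reconstructed one, and the GPU entry points take an explicit FORWARD_OR_ADJOINT flag (1 or 3) instead of a SIMULATION_TYPE argument. A minimal sketch of how a time loop would be expected to drive the pair (hypothetical driver; the actual call site sits in the time-iteration code, which is not part of this hunk):

  subroutine elastic_step_sketch()
    ! hypothetical driver illustrating the new calling convention
    use specfem_par, only: SIMULATION_TYPE
    implicit none
    ! forward or adjoint wavefield (the routine itself returns early for type 3)
    call compute_forces_viscoelastic()
    ! backward/reconstructed wavefield, kernel simulations only
    ! (the routine itself returns early unless SIMULATION_TYPE == 3)
    if( SIMULATION_TYPE == 3 ) call compute_forces_viscoelastic_backward()
  end subroutine elastic_step_sketch

Since both routines guard on SIMULATION_TYPE themselves, calling them unconditionally in sequence would also be safe; the early returns make the split robust against call-site mistakes.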
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.F90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -310,8 +310,15 @@
             vector_accel(3) = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l
 
             ! density kernel
+!            rho_kl_outer_core(i,j,k,ispec) = rho_kl_outer_core(i,j,k,ispec) &
+!              + deltat * dot_product(vector_accel(:), b_vector_displ_outer_core(:,iglob))
+
+!! DK DK July 2013: replaces dot_product() with an unrolled expression, otherwise most compilers
+!! DK DK July 2013: will try to vectorize this rather than the outer loop, resulting in much slower code
             rho_kl_outer_core(i,j,k,ispec) = rho_kl_outer_core(i,j,k,ispec) &
-               + deltat * dot_product(vector_accel(:), b_vector_displ_outer_core(:,iglob))
+              + deltat * (  vector_accel(1) * b_vector_displ_outer_core(1,iglob) &
+                          + vector_accel(2) * b_vector_displ_outer_core(2,iglob) &
+                          + vector_accel(3) * b_vector_displ_outer_core(3,iglob) )
 
             ! bulk modulus kernel
             kappal = rhostore_outer_core(i,j,k,ispec)/kappavstore_outer_core(i,j,k,ispec)

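The dot_product() replacement above is a general vectorization pattern rather than anything specific to this kernel: for a fixed, tiny vector length (3 components here), writing the reduction out by hand leaves the compiler free to vectorize the long surrounding i/j/k/ispec loops instead of the length-3 reduction. A minimal, self-contained sketch of the same pattern with hypothetical array names:

  program unrolled_dot_sketch
    implicit none
    integer, parameter :: n = 1000
    real :: accel(3,n), b_displ(3,n), rho_kl(n)
    real :: deltat
    integer :: iglob

    deltat = 0.01
    call random_number(accel)
    call random_number(b_displ)
    rho_kl(:) = 0.

    ! unrolled 3-component dot product: the compiler can now vectorize
    ! over iglob instead of the short inner reduction
    do iglob = 1,n
      rho_kl(iglob) = rho_kl(iglob) &
                    + deltat * (  accel(1,iglob) * b_displ(1,iglob) &
                                + accel(2,iglob) * b_displ(2,iglob) &
                                + accel(3,iglob) * b_displ(3,iglob) )
    enddo

    print *, 'sum of kernel contributions: ', sum(rho_kl)
  end program unrolled_dot_sketch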
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -25,18 +25,20 @@
 !
 !=====================================================================
 
-  subroutine compute_stacey_crust_mantle()
+  subroutine compute_stacey_crust_mantle_forward()
 
+! Stacey conditions for forward or adjoint wavefields (SIMULATION_TYPE == 1 or 2)
+
   use constants_solver
 
   use specfem_par,only: &
-    ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
+    ichunk,SIMULATION_TYPE,SAVE_FORWARD,it, &
     wgllwgll_xz,wgllwgll_yz
 
   use specfem_par,only: GPU_MODE,Mesh_pointer
 
   use specfem_par_crustmantle, only: &
-    veloc_crust_mantle,accel_crust_mantle,b_accel_crust_mantle, &
+    veloc_crust_mantle,accel_crust_mantle, &
     ibool_crust_mantle, &
     jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle, &
     jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle, &
@@ -71,6 +73,8 @@
   !           file access (by process rank modulo 8) showed that the following,
   !           simple approach is still fastest. (assuming that files are accessed on a local scratch disk)
 
+  ! checks
+  if( SIMULATION_TYPE == 3 ) return
 
   ! crust & mantle
 
@@ -78,13 +82,6 @@
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
 
-    ! reads absorbing boundary values
-    if (SIMULATION_TYPE == 3 .and. nspec2D_xmin_crust_mantle > 0)  then
-      ! note: backward/reconstructed wavefields are read in after the Newmark time scheme in the first time loop
-      !          this leads to a corresponding boundary condition at time index NSTEP - (it-1) = NSTEP - it + 1
-      call read_abs(0,absorb_xmin_crust_mantle,reclen_xmin_crust_mantle,NSTEP-it+1)
-    endif
-
     if ( .NOT. GPU_MODE) then
       ! on CPU
       do ispec2D=1,nspec2D_xmin_crust_mantle
@@ -119,9 +116,7 @@
             accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
             accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-            if (SIMULATION_TYPE == 3) then
-              b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmin_crust_mantle(:,j,k,ispec2D)
-            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            if( SAVE_FORWARD ) then
               absorb_xmin_crust_mantle(1,j,k,ispec2D) = tx*weight
               absorb_xmin_crust_mantle(2,j,k,ispec2D) = ty*weight
               absorb_xmin_crust_mantle(3,j,k,ispec2D) = tz*weight
@@ -138,7 +133,7 @@
     endif
 
     ! writes absorbing boundary values
-    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmin_crust_mantle > 0 ) then
+    if( SAVE_FORWARD .and. nspec2D_xmin_crust_mantle > 0 ) then
       call write_abs(0,absorb_xmin_crust_mantle, reclen_xmin_crust_mantle,it)
     endif
 
@@ -148,11 +143,6 @@
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
 
-    ! reads absorbing boundary values
-    if (SIMULATION_TYPE == 3 .and. nspec2D_xmax_crust_mantle > 0)  then
-      call read_abs(1,absorb_xmax_crust_mantle,reclen_xmax_crust_mantle,NSTEP-it+1)
-    endif
-
     if(.NOT. GPU_MODE ) then
       ! on CPU
       do ispec2D=1,nspec2D_xmax_crust_mantle
@@ -187,9 +177,7 @@
             accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
             accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-            if (SIMULATION_TYPE == 3) then
-              b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmax_crust_mantle(:,j,k,ispec2D)
-            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            if( SAVE_FORWARD ) then
               absorb_xmax_crust_mantle(1,j,k,ispec2D) = tx*weight
               absorb_xmax_crust_mantle(2,j,k,ispec2D) = ty*weight
               absorb_xmax_crust_mantle(3,j,k,ispec2D) = tz*weight
@@ -208,7 +196,7 @@
 
 
     ! writes absorbing boundary values
-    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmax_crust_mantle > 0 ) then
+    if( SAVE_FORWARD .and. nspec2D_xmax_crust_mantle > 0 ) then
       call write_abs(1,absorb_xmax_crust_mantle,reclen_xmax_crust_mantle,it)
     endif
 
@@ -216,11 +204,6 @@
 
   !   ymin
 
-  ! reads absorbing boundary values
-  if (SIMULATION_TYPE == 3 .and. nspec2D_ymin_crust_mantle > 0)  then
-    call read_abs(2,absorb_ymin_crust_mantle, reclen_ymin_crust_mantle,NSTEP-it+1)
-  endif
-
   if( .NOT. GPU_MODE ) then
     ! on CPU
     do ispec2D=1,nspec2D_ymin_crust_mantle
@@ -255,9 +238,7 @@
           accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
           accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymin_crust_mantle(:,i,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+          if( SAVE_FORWARD ) then
             absorb_ymin_crust_mantle(1,i,k,ispec2D) = tx*weight
             absorb_ymin_crust_mantle(2,i,k,ispec2D) = ty*weight
             absorb_ymin_crust_mantle(3,i,k,ispec2D) = tz*weight
@@ -275,17 +256,12 @@
   endif
 
   ! writes absorbing boundary values
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymin_crust_mantle > 0 ) then
+  if( SAVE_FORWARD .and. nspec2D_ymin_crust_mantle > 0 ) then
     call write_abs(2,absorb_ymin_crust_mantle,reclen_ymin_crust_mantle,it)
   endif
 
   !   ymax
 
-  ! reads absorbing boundary values
-  if (SIMULATION_TYPE == 3 .and. nspec2D_ymax_crust_mantle > 0)  then
-    call read_abs(3,absorb_ymax_crust_mantle,reclen_ymax_crust_mantle,NSTEP-it+1)
-  endif
-
   if( .NOT. GPU_MODE ) then
     ! on CPU
     do ispec2D=1,nspec2D_ymax_crust_mantle
@@ -320,9 +296,7 @@
           accel_crust_mantle(2,iglob)=accel_crust_mantle(2,iglob) - ty*weight
           accel_crust_mantle(3,iglob)=accel_crust_mantle(3,iglob) - tz*weight
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymax_crust_mantle(:,i,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+          if( SAVE_FORWARD ) then
             absorb_ymax_crust_mantle(1,i,k,ispec2D) = tx*weight
             absorb_ymax_crust_mantle(2,i,k,ispec2D) = ty*weight
             absorb_ymax_crust_mantle(3,i,k,ispec2D) = tz*weight
@@ -340,9 +314,202 @@
   endif
 
   ! writes absorbing boundary values
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymax_crust_mantle > 0 ) then
+  if( SAVE_FORWARD .and. nspec2D_ymax_crust_mantle > 0 ) then
     call write_abs(3,absorb_ymax_crust_mantle,reclen_ymax_crust_mantle,it)
   endif
 
-  end subroutine compute_stacey_crust_mantle
+  end subroutine compute_stacey_crust_mantle_forward
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine compute_stacey_crust_mantle_backward()
+
+! stacey conditions for the backward/reconstructed wavefield (SIMULATION_TYPE == 3)
+
+  use constants_solver
+
+  use specfem_par,only: &
+    ichunk,SIMULATION_TYPE,NSTEP,it
+
+  use specfem_par,only: GPU_MODE,Mesh_pointer
+
+  use specfem_par_crustmantle, only: &
+    b_accel_crust_mantle, &
+    ibool_crust_mantle, &
+    ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle, &
+    ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle, &
+    nimin_crust_mantle,nimax_crust_mantle, &
+    njmin_crust_mantle,njmax_crust_mantle, &
+    nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
+    nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
+    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
+    reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
+    reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
+    absorb_xmin_crust_mantle,absorb_xmax_crust_mantle, &
+    absorb_ymin_crust_mantle,absorb_ymax_crust_mantle
+
+  implicit none
+
+  ! local parameters
+  integer :: i,j,k,ispec,iglob,ispec2D
+
+  ! note: we use C functions for I/O as they still have better performance than
+  !           Fortran unformatted file I/O. however, using -assume byterecl together with Fortran functions
+  !           comes very close (only ~ 4 % slower).
+  !
+  !           tests with intermediate storages (every 8th step) and/or asynchronous
+  !           file access (by process rank modulo 8) showed that the following,
+  !           simple approach is still fastest (assuming that files are accessed on a local scratch disk).
+
+  ! checks
+  if (SIMULATION_TYPE /= 3 ) return
+
+  ! crust & mantle
+
+  !   xmin
+  ! if two chunks exclude this face for one of them
+  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
+
+    ! reads absorbing boundary values
+    if( nspec2D_xmin_crust_mantle > 0 )  then
+      ! note: backward/reconstructed wavefields are read in after the Newmark time scheme in the first time loop
+      !          this leads to a corresponding boundary condition at time index NSTEP - (it-1) = NSTEP - it + 1
+      call read_abs(0,absorb_xmin_crust_mantle,reclen_xmin_crust_mantle,NSTEP-it+1)
+    endif
+
+    if ( .NOT. GPU_MODE) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmin_crust_mantle
+
+        ispec=ibelm_xmin_crust_mantle(ispec2D)
+
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_crust_mantle(1,ispec2D) == 0 .or. njmin_crust_mantle(1,ispec2D) == 0) cycle
+
+        i=1
+        do k=nkmin_xi_crust_mantle(1,ispec2D),NGLLZ
+          do j=njmin_crust_mantle(1,ispec2D),njmax_crust_mantle(1,ispec2D)
+            iglob=ibool_crust_mantle(i,j,k,ispec)
+
+            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmin_crust_mantle(:,j,k,ispec2D)
+          enddo
+        enddo
+      enddo
+
+    else
+      ! on GPU
+      if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
+                                                                absorb_xmin_crust_mantle, &
+                                                                0) ! <= xmin
+    endif
+
+  endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC
+
+  !   xmax
+  ! if two chunks exclude this face for one of them
+  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
+
+    ! reads absorbing boundary values
+    if( nspec2D_xmax_crust_mantle > 0 ) then
+      call read_abs(1,absorb_xmax_crust_mantle,reclen_xmax_crust_mantle,NSTEP-it+1)
+    endif
+
+    if(.NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmax_crust_mantle
+
+        ispec=ibelm_xmax_crust_mantle(ispec2D)
+
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_crust_mantle(2,ispec2D) == 0 .or. njmin_crust_mantle(2,ispec2D) == 0) cycle
+
+        i=NGLLX
+        do k=nkmin_xi_crust_mantle(2,ispec2D),NGLLZ
+          do j=njmin_crust_mantle(2,ispec2D),njmax_crust_mantle(2,ispec2D)
+            iglob=ibool_crust_mantle(i,j,k,ispec)
+
+            b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_xmax_crust_mantle(:,j,k,ispec2D)
+          enddo
+        enddo
+      enddo
+
+    else
+      ! on GPU
+      if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
+                                                                absorb_xmax_crust_mantle, &
+                                                                1) ! <= xmax
+    endif
+
+  endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB
+
+  !   ymin
+
+  ! reads absorbing boundary values
+  if( nspec2D_ymin_crust_mantle > 0 ) then
+    call read_abs(2,absorb_ymin_crust_mantle, reclen_ymin_crust_mantle,NSTEP-it+1)
+  endif
+
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymin_crust_mantle
+
+      ispec=ibelm_ymin_crust_mantle(ispec2D)
+
+      ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_crust_mantle(1,ispec2D) == 0 .or. nimin_crust_mantle(1,ispec2D) == 0) cycle
+
+      j=1
+      do k=nkmin_eta_crust_mantle(1,ispec2D),NGLLZ
+        do i=nimin_crust_mantle(1,ispec2D),nimax_crust_mantle(1,ispec2D)
+          iglob=ibool_crust_mantle(i,j,k,ispec)
+
+          b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymin_crust_mantle(:,i,k,ispec2D)
+        enddo
+      enddo
+    enddo
+
+  else
+    ! on GPU
+    if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
+                                                              absorb_ymin_crust_mantle, &
+                                                              2) ! <= ymin
+  endif
+
+  !   ymax
+
+  ! reads absorbing boundary values
+  if( nspec2D_ymax_crust_mantle > 0 ) then
+    call read_abs(3,absorb_ymax_crust_mantle,reclen_ymax_crust_mantle,NSTEP-it+1)
+  endif
+
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymax_crust_mantle
+
+      ispec=ibelm_ymax_crust_mantle(ispec2D)
+
+      ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_crust_mantle(2,ispec2D) == 0 .or. nimin_crust_mantle(2,ispec2D) == 0) cycle
+
+      j=NGLLY
+      do k=nkmin_eta_crust_mantle(2,ispec2D),NGLLZ
+        do i=nimin_crust_mantle(2,ispec2D),nimax_crust_mantle(2,ispec2D)
+          iglob=ibool_crust_mantle(i,j,k,ispec)
+
+          b_accel_crust_mantle(:,iglob)=b_accel_crust_mantle(:,iglob) - absorb_ymax_crust_mantle(:,i,k,ispec2D)
+        enddo
+      enddo
+    enddo
+
+  else
+    ! on GPU
+    if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
+                                                              absorb_ymax_crust_mantle, &
+                                                              3) ! <= ymax
+  endif
+
+  end subroutine compute_stacey_crust_mantle_backward
+
+
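The split above leaves the two disk passes over the same absorbing-boundary files mirrored: the forward routine writes one record per time step (write_abs at index it), and the backward routine replays them in reverse (read_abs at index NSTEP-it+1). A minimal, self-contained sketch of that record mirroring, using plain Fortran direct-access I/O as a stand-in for the solver's C-backed read_abs/write_abs helpers (file name and stored values here are illustrative):

  program stacey_record_order
  implicit none
  integer, parameter :: NSTEP = 5
  integer :: it,iunit,lrec
  real :: boundary_term
  ! record length in processor-dependent units, kept portable via iolength
  inquire(iolength=lrec) boundary_term
  open(newunit=iunit,file='absorb_test.bin',access='direct',recl=lrec, &
       form='unformatted',status='replace',action='readwrite')
  ! forward run: one record per time step (what write_abs does at index it)
  do it = 1,NSTEP
    boundary_term = real(it)   ! stands in for the stored tx*weight,... terms
    write(iunit,rec=it) boundary_term
  enddo
  ! backward run: step it of the reconstruction uses forward step NSTEP-it+1,
  ! consistent with restoring the snapshot after the first Newmark update
  do it = 1,NSTEP
    read(iunit,rec=NSTEP-it+1) boundary_term
    print *,'backward it =',it,' reads record',NSTEP-it+1,' value =',boundary_term
  enddo
  close(iunit,status='delete')
  end program stacey_record_order

In the routines above, the same pairing appears as write_abs(...,it) in compute_stacey_crust_mantle_forward and read_abs(...,NSTEP-it+1) in compute_stacey_crust_mantle_backward.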

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -25,18 +25,18 @@
 !
 !=====================================================================
 
-  subroutine compute_stacey_outer_core()
+  subroutine compute_stacey_outer_core_forward()
 
   use constants_solver
 
   use specfem_par,only: &
-    ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
+    ichunk,SIMULATION_TYPE,SAVE_FORWARD,it, &
     wgllwgll_xz,wgllwgll_yz,wgllwgll_xy
 
   use specfem_par,only: GPU_MODE,Mesh_pointer
 
   use specfem_par_outercore,only: &
-    veloc_outer_core,accel_outer_core,b_accel_outer_core, &
+    veloc_outer_core,accel_outer_core, &
     ibool_outer_core, &
     jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core, &
     jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
@@ -69,18 +69,15 @@
   !           file access (by process rank modulo 8) showed that the following,
   !           simple approach is still fastest. (assuming that files are accessed on a local scratch disk)
 
+  ! checks
+  if( SIMULATION_TYPE == 3 ) return
+
   ! outer core
+
   !   xmin
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
 
-    ! reads absorbing boundary values
-    if (SIMULATION_TYPE == 3 .and. nspec2D_xmin_outer_core > 0)  then
-      ! note: backward/reconstructed wavefields are read in after the Newmark time scheme in the first time loop
-      !          this leads to a corresponding boundary condition at time index NSTEP - (it-1) = NSTEP - it + 1
-      call read_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,NSTEP-it+1)
-    endif
-
     if( .NOT. GPU_MODE) then
       ! on CPU
       do ispec2D=1,nspec2D_xmin_outer_core
@@ -101,9 +98,7 @@
 
             accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-            if (SIMULATION_TYPE == 3) then
-              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
-            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            if( SAVE_FORWARD ) then
               absorb_xmin_outer_core(j,k,ispec2D) = weight*sn
             endif
           enddo
@@ -118,7 +113,7 @@
     endif
 
     ! writes absorbing boundary values
-    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmin_outer_core > 0 ) then
+    if( SAVE_FORWARD .and. nspec2D_xmin_outer_core > 0 ) then
       call write_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,it)
     endif
 
@@ -128,10 +123,6 @@
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
 
-    if (SIMULATION_TYPE == 3 .and. nspec2D_xmax_outer_core > 0)  then
-      call read_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,NSTEP-it+1)
-    endif
-
     if( .NOT. GPU_MODE ) then
       ! on CPU
       do ispec2D=1,nspec2D_xmax_outer_core
@@ -152,9 +143,7 @@
 
             accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-            if (SIMULATION_TYPE == 3) then
-              b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
-            else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+            if( SAVE_FORWARD ) then
               absorb_xmax_outer_core(j,k,ispec2D) = weight*sn
             endif
 
@@ -169,16 +158,13 @@
                                                                 5) ! <= xmax
     endif
 
-    if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmax_outer_core > 0 ) then
+    if( SAVE_FORWARD .and. nspec2D_xmax_outer_core > 0 ) then
       call write_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,it)
     endif
 
   endif
 
   !   ymin
-  if (SIMULATION_TYPE == 3 .and. nspec2D_ymin_outer_core > 0)  then
-    call read_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,NSTEP-it+1)
-  endif
 
   if( .NOT. GPU_MODE ) then
     ! on CPU
@@ -200,9 +186,7 @@
 
           accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+          if( SAVE_FORWARD ) then
             absorb_ymin_outer_core(i,k,ispec2D) = weight*sn
           endif
 
@@ -217,16 +201,12 @@
                                                               6) ! <= ymin
   endif
 
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymin_outer_core > 0 ) then
+  if( SAVE_FORWARD .and. nspec2D_ymin_outer_core > 0 ) then
     call write_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,it)
   endif
 
   !   ymax
 
-  if (SIMULATION_TYPE == 3 .and. nspec2D_ymax_outer_core > 0)  then
-    call read_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,NSTEP-it+1)
-  endif
-
   if( .NOT. GPU_MODE ) then
     ! on CPU
     do ispec2D=1,nspec2D_ymax_outer_core
@@ -247,9 +227,7 @@
 
           accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+          if( SAVE_FORWARD ) then
             absorb_ymax_outer_core(i,k,ispec2D) = weight*sn
           endif
 
@@ -264,16 +242,13 @@
                                                               7) ! <= ymax
   endif
 
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymax_outer_core > 0 ) then
+  if( SAVE_FORWARD .and. nspec2D_ymax_outer_core > 0 ) then
     call write_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,it)
   endif
 
   ! zmin
 
   ! for surface elements exactly on the ICB
-  if (SIMULATION_TYPE == 3 .and. nspec2D_zmin_outer_core > 0)  then
-    call read_abs(8,absorb_zmin_outer_core,reclen_zmin,NSTEP-it+1)
-  endif
 
   if( .NOT. GPU_MODE ) then
     ! on CPU
@@ -292,9 +267,7 @@
 
           accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
 
-          if (SIMULATION_TYPE == 3) then
-            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
-          else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+          if( SAVE_FORWARD ) then
             absorb_zmin_outer_core(i,j,ispec2D) = weight*sn
           endif
 
@@ -309,8 +282,228 @@
                                                               8) ! <= zmin
   endif
 
-  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_zmin_outer_core > 0 ) then
+  if( SAVE_FORWARD .and. nspec2D_zmin_outer_core > 0 ) then
     call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
   endif
 
-  end subroutine compute_stacey_outer_core
+  end subroutine compute_stacey_outer_core_forward
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine compute_stacey_outer_core_backward()
+
+  use constants_solver
+
+  use specfem_par,only: &
+    ichunk,SIMULATION_TYPE,NSTEP,it
+
+  use specfem_par,only: GPU_MODE,Mesh_pointer
+
+  use specfem_par_outercore,only: &
+    b_accel_outer_core, &
+    ibool_outer_core, &
+    nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
+    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core,nspec2D_zmin_outer_core, &
+    nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core, &
+    njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core, &
+    absorb_xmin_outer_core,absorb_xmax_outer_core, &
+    absorb_ymin_outer_core,absorb_ymax_outer_core, &
+    absorb_zmin_outer_core, &
+    reclen_xmin_outer_core,reclen_xmax_outer_core, &
+    reclen_ymin_outer_core,reclen_ymax_outer_core, &
+    reclen_zmin, &
+    ibelm_xmin_outer_core,ibelm_xmax_outer_core, &
+    ibelm_ymin_outer_core,ibelm_ymax_outer_core, &
+    ibelm_bottom_outer_core
+
+  implicit none
+
+  ! local parameters
+  integer :: i,j,k,ispec2D,ispec,iglob
+
+  ! note: we use C functions for I/O as they still have better performance than
+  !           Fortran unformatted file I/O. however, using -assume byterecl together with Fortran functions
+  !           comes very close (only ~ 4 % slower).
+  !
+  !           tests with intermediate storages (every 8th step) and/or asynchronous
+  !           file access (by process rank modulo 8) showed that the following,
+  !           simple approach is still fastest (assuming that files are accessed on a local scratch disk).
+
+  ! checks
+  if (SIMULATION_TYPE /= 3 ) return
+
+  ! outer core
+
+  !   xmin
+  ! if two chunks exclude this face for one of them
+  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
+
+    ! reads absorbing boundary values
+    if( nspec2D_xmin_outer_core > 0 ) then
+      ! note: backward/reconstructed wavefields are read in after the Newmark time scheme in the first time loop
+      !          this leads to a corresponding boundary condition at time index NSTEP - (it-1) = NSTEP - it + 1
+      call read_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,NSTEP-it+1)
+    endif
+
+    if( .NOT. GPU_MODE) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmin_outer_core
+
+        ispec=ibelm_xmin_outer_core(ispec2D)
+
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+
+        i=1
+        do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+          do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
+
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
+          enddo
+        enddo
+      enddo
+
+    else
+      ! on GPU
+      if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
+                                                                absorb_xmin_outer_core, &
+                                                                4) ! <= xmin
+    endif
+
+  endif
+
+  !   xmax
+  ! if two chunks exclude this face for one of them
+  if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
+
+    if( nspec2D_xmax_outer_core > 0 ) then
+      call read_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,NSTEP-it+1)
+    endif
+
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+      do ispec2D=1,nspec2D_xmax_outer_core
+
+        ispec=ibelm_xmax_outer_core(ispec2D)
+
+        ! exclude elements that are not on absorbing edges
+        if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+
+        i=NGLLX
+        do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+          do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+            iglob=ibool_outer_core(i,j,k,ispec)
+
+            b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
+          enddo
+        enddo
+      enddo
+
+    else
+      ! on GPU
+      if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
+                                                                absorb_xmax_outer_core, &
+                                                                5) ! <= xmax
+    endif
+
+  endif
+
+  !   ymin
+  if( nspec2D_ymin_outer_core > 0 ) then
+    call read_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,NSTEP-it+1)
+  endif
+
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymin_outer_core
+
+      ispec=ibelm_ymin_outer_core(ispec2D)
+
+      ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+
+      j=1
+      do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+        do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+          iglob=ibool_outer_core(i,j,k,ispec)
+
+          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
+        enddo
+      enddo
+    enddo
+
+  else
+    ! on GPU
+    if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
+                                                              absorb_ymin_outer_core, &
+                                                              6) ! <= ymin
+  endif
+
+  !   ymax
+
+  if( nspec2D_ymax_outer_core > 0 ) then
+    call read_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,NSTEP-it+1)
+  endif
+
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D=1,nspec2D_ymax_outer_core
+
+      ispec=ibelm_ymax_outer_core(ispec2D)
+
+      ! exclude elements that are not on absorbing edges
+      if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+
+      j=NGLLY
+      do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+        do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+          iglob=ibool_outer_core(i,j,k,ispec)
+
+          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
+        enddo
+      enddo
+    enddo
+
+  else
+    ! on GPU
+    if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
+                                                              absorb_ymax_outer_core, &
+                                                              7) ! <= ymax
+  endif
+
+  ! zmin
+
+  ! for surface elements exactly on the ICB
+  if( nspec2D_zmin_outer_core > 0 ) then
+    call read_abs(8,absorb_zmin_outer_core,reclen_zmin,NSTEP-it+1)
+  endif
+
+  if( .NOT. GPU_MODE ) then
+    ! on CPU
+    do ispec2D = 1,nspec2D_zmin_outer_core
+
+      ispec = ibelm_bottom_outer_core(ispec2D)
+
+      k = 1
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          iglob = ibool_outer_core(i,j,k,ispec)
+
+          b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
+        enddo
+      enddo
+    enddo
+
+  else
+    ! on GPU
+    if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
+                                                              absorb_zmin_outer_core, &
+                                                              8) ! <= zmin
+  endif
+
+  end subroutine compute_stacey_outer_core_backward
+
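In the fluid outer core the stored boundary term is a single scalar per GLL point rather than a 3-component traction. A hedged sketch of the per-point quantity the hunks above move around (argument names are illustrative; in the solver it appears as weight*sn):

  ! first-order (Stacey) absorbing term for the acoustic potential formulation
  pure function stacey_acoustic_term(veloc,vp,jacobianw) result(t)
  implicit none
  real, intent(in) :: veloc     ! velocity-potential time derivative at the point
  real, intent(in) :: vp        ! P-wave speed at the point
  real, intent(in) :: jacobianw ! 2D boundary jacobian times GLL weight
  real :: t
  ! outgoing-characteristic damping: the forward run subtracts this from
  ! accel_outer_core (and writes it to disk with SAVE_FORWARD); the backward
  ! run reads it back and subtracts it from b_accel_outer_core
  t = jacobianw * veloc / vp
  end function stacey_acoustic_term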

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -109,7 +109,11 @@
   ! dump kernel arrays
   if (SIMULATION_TYPE == 3) then
     ! crust mantle
-    call save_kernels_crust_mantle()
+    if (SAVE_REGULAR_KL) then
+      call save_regular_kernels_crust_mantle()
+    else
+      call save_kernels_crust_mantle()
+    endif
 
     ! noise strength kernel
     if (NOISE_TOMOGRAPHY == 3) then

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -86,12 +86,30 @@
     ! elastic solver for crust/mantle and inner core
     call compute_forces_viscoelastic()
 
-    ! restores last time snapshot saved for backward/reconstruction of wavefields
-    ! note: this is done here after the Newmark time scheme, otherwise the indexing for sources
-    !          and adjoint sources will become more complicated
-    !          that is, index it for adjoint sources will match index NSTEP - 1 for backward/reconstructed wavefields
-    if( SIMULATION_TYPE == 3 .and. it == 1 ) then
-      call read_forward_arrays()
+    ! kernel simulations (forward and adjoint wavefields)
+    if( SIMULATION_TYPE == 3 ) then
+      ! reconstructs forward wavefields based on the last stored wavefield snapshot
+
+      ! update displacement using Newmark time scheme
+      call update_displacement_Newmark_backward()
+
+      ! acoustic solver for outer core
+      ! (needs to be done first, before elastic one)
+      call compute_forces_acoustic_backward()
+
+      ! elastic solver for crust/mantle and inner core
+      call compute_forces_viscoelastic_backward()
+
+      ! restores last time snapshot saved for backward/reconstruction of wavefields
+      ! note: this is done here after the Newmark time scheme, otherwise the indexing for sources
+      !          and adjoint sources will become more complicated
+      !          that is, index it for adjoint sources will match index NSTEP - it + 1 for backward/reconstructed wavefields
+      if( it == 1 ) then
+        call read_forward_arrays()
+      endif
+
+      ! adjoint simulations: kernels
+      call compute_kernels()
     endif
 
     ! write the seismograms with time shift
@@ -99,11 +117,6 @@
       call write_seismograms()
     endif
 
-    ! adjoint simulations: kernels
-    if( SIMULATION_TYPE == 3 ) then
-      call compute_kernels()
-    endif
-
     ! outputs movie files
     call write_movie_output()
 

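The reordering above interleaves the backward/reconstructed solvers and the kernel update inside the same time step, instead of computing kernels after the seismogram output. A runnable toy that only prints the resulting call order (the stubs stand in for the solver routines named in the hunk and do no physics):

  program time_loop_order
  implicit none
  integer, parameter :: NSTEP = 2, SIMULATION_TYPE = 3
  integer :: it
  do it = 1,NSTEP
    call step('update_displacement_Newmark')
    call step('compute_forces_acoustic')           ! fluid solver first
    call step('compute_forces_viscoelastic')
    if( SIMULATION_TYPE == 3 ) then
      call step('update_displacement_Newmark_backward')
      call step('compute_forces_acoustic_backward')
      call step('compute_forces_viscoelastic_backward')
      if( it == 1 ) call step('read_forward_arrays')  ! after the Newmark update
      call step('compute_kernels')
    endif
    call step('write_seismograms')                 ! conditional in the solver
  enddo
  contains
    subroutine step(name)
    character(len=*), intent(in) :: name
    print *,it,' ',name
    end subroutine step
  end program time_loop_order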
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_regular_points.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_regular_points.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_regular_points.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -25,7 +25,7 @@
 !
 !=====================================================================
 
-subroutine read_kl_regular_grid(myrank, GRID)
+  subroutine read_kl_regular_grid(myrank, GRID)
 
   use constants
 
@@ -82,11 +82,11 @@
     call exit_MPI(myrank, 'No Model points read in')
   endif
 
-end subroutine read_kl_regular_grid
+  end subroutine read_kl_regular_grid
 
 !==============================================================
 
-subroutine find_regular_grid_slice_number(slice_number, GRID, &
+  subroutine find_regular_grid_slice_number(slice_number, GRID, &
                                           NCHUNKS, NPROC_XI, NPROC_ETA)
 
   use constants
@@ -150,15 +150,15 @@
     slice_number(isp) = nproc * nproc * (chunk_isp-1) + nproc * iproc_eta + iproc_xi
   enddo
 
-end subroutine find_regular_grid_slice_number
+  end subroutine find_regular_grid_slice_number
 
 !==============================================================
 
 ! how about using single precision for the iterations?
-subroutine locate_regular_points(npoints_slice,points_slice,GRID, &
-                             NEX_XI,nspec,xstore,ystore,zstore,ibool, &
-                             xigll,yigll,zigll,ispec_reg, &
-                             hxir_reg,hetar_reg,hgammar_reg)
+  subroutine locate_regular_points(npoints_slice,points_slice,GRID, &
+                                   NEX_XI,nspec,xstore,ystore,zstore,ibool, &
+                                   xigll,yigll,zigll,ispec_reg, &
+                                   hxir_reg,hetar_reg,hgammar_reg)
 
   use constants_solver
 
@@ -357,11 +357,11 @@
 ! DEBUG
 !  print *, 'Maximum distance discrepancy ', maxval(dist_final(1:npoints_slice))
 
-end subroutine locate_regular_points
+  end subroutine locate_regular_points
 
 !==============================================================
 
-subroutine hex_nodes2(iaddx,iaddy,iaddz)
+  subroutine hex_nodes2(iaddx,iaddy,iaddz)
 
   use constants
 
@@ -408,11 +408,11 @@
 
   enddo
 
-end subroutine hex_nodes2
+  end subroutine hex_nodes2
 
 !==============================================================
 
-subroutine lagrange_any2(xi,NGLL,xigll,h)
+  subroutine lagrange_any2(xi,NGLL,xigll,h)
 
 ! subroutine to compute the Lagrange interpolants based upon the GLL points
 ! and their first derivatives at any point xi in [-1,1]
@@ -441,11 +441,11 @@
      h(dgr) = prod1 / prod2
   enddo
 
-end subroutine lagrange_any2
+  end subroutine lagrange_any2
 
 !==============================================================
 
-subroutine chunk_map(k,xx,yy,zz,xi,eta)
+  subroutine chunk_map(k,xx,yy,zz,xi,eta)
 
   ! this routine gets the xi,eta for the (xx,yy,zz)
   ! point in the k'th chunk's coordinate system
@@ -492,5 +492,5 @@
      stop 'chunk number k < 6'
   endif
 
-end subroutine chunk_map
+  end subroutine chunk_map
 

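For reference, the h(dgr) = prod1 / prod2 assignment kept in the lagrange_any2 hunk evaluates the standard Lagrange basis at xi,

  h(dgr) = product over i = 1,..,NGLL with i /= dgr of (xi - xigll(i)) / (xigll(dgr) - xigll(i))

so h(dgr) equals 1 at xigll(dgr) and 0 at every other GLL point, which is what makes the interpolated field exact at the collocation points.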
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_sources.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_sources.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -196,23 +196,41 @@
 
     ! allocate arrays specific to each subset
     allocate(final_distance_source_subset(NSOURCES_SUBSET_current_size), &
-            ispec_selected_source_subset(NSOURCES_SUBSET_current_size), &
-            ispec_selected_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            xi_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            eta_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            gamma_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            final_distance_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            x_found_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            y_found_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            z_found_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
-            xi_source_subset(NSOURCES_SUBSET_current_size), &
-            eta_source_subset(NSOURCES_SUBSET_current_size), &
-            gamma_source_subset(NSOURCES_SUBSET_current_size), &
-            x_found_source(NSOURCES_SUBSET_current_size), &
-            y_found_source(NSOURCES_SUBSET_current_size), &
-            z_found_source(NSOURCES_SUBSET_current_size),stat=ier)
+             ispec_selected_source_subset(NSOURCES_SUBSET_current_size), &
+             xi_source_subset(NSOURCES_SUBSET_current_size), &
+             eta_source_subset(NSOURCES_SUBSET_current_size), &
+             gamma_source_subset(NSOURCES_SUBSET_current_size), &
+             x_found_source(NSOURCES_SUBSET_current_size), &
+             y_found_source(NSOURCES_SUBSET_current_size), &
+             z_found_source(NSOURCES_SUBSET_current_size),stat=ier)
     if( ier /= 0 ) call exit_MPI(myrank,'error allocating temporary source arrays')
 
+    ! arrays to collect data
+    if( myrank == 0 ) then
+      allocate(ispec_selected_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
+               xi_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
+               eta_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
+               gamma_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
+               final_distance_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
+               x_found_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
+               y_found_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1), &
+               z_found_source_all(NSOURCES_SUBSET_current_size,0:NPROCTOT_VAL-1),stat=ier)
+      if( ier /= 0 ) call exit_MPI(myrank,'error allocating temporary source arrays for gather')
+    else
+      ! dummy arrays
+      allocate(ispec_selected_source_all(1,1), &
+               xi_source_all(1,1), &
+               eta_source_all(1,1), &
+               gamma_source_all(1,1), &
+               final_distance_source_all(1,1), &
+               x_found_source_all(1,1), &
+               y_found_source_all(1,1), &
+               z_found_source_all(1,1),stat=ier)
+      if( ier /= 0 ) call exit_MPI(myrank,'error allocating temporary source dummy arrays for gather')
+    endif
+    ! use -1 as a flag to detect if gather fails for some reason
+    ispec_selected_source_all(:,:) = -1
+
     ! make sure we clean the subset array before the gather
     ispec_selected_source_subset(:) = 0
     final_distance_source_subset(:) = HUGEVAL
@@ -549,18 +567,22 @@
 
     ! end of loop on all the sources
     enddo
+    ! synchronizes processes
+    call sync_all()
 
     ! now gather information from all the nodes
-    ! use -1 as a flag to detect if gather fails for some reason
-    ispec_selected_source_all(:,:) = -1
-
     call gather_all_i(ispec_selected_source_subset,NSOURCES_SUBSET_current_size, &
       ispec_selected_source_all,NSOURCES_SUBSET_current_size,NPROCTOT_VAL)
 
+    ! daniel debug
+    !print*,'rank',myrank,'ispec:',ispec_selected_source_subset(:),'all:',ispec_selected_source_all(:,:)
+
     ! checks that the gather operation went well
     if(myrank == 0) then
-      if(minval(ispec_selected_source_all(:,:)) <= 0) &
+      if(minval(ispec_selected_source_all(:,:)) <= 0) then
+        print*,'error ispec all:',ispec_selected_source_all(:,:)
         call exit_MPI(myrank,'gather operation failed for source')
+      endif
     endif
 
     call gather_all_dp(xi_source_subset,NSOURCES_SUBSET_current_size, &
@@ -714,11 +736,11 @@
     ! deallocate arrays specific to each subset
     deallocate(final_distance_source_subset)
     deallocate(ispec_selected_source_subset)
+    deallocate(xi_source_subset,eta_source_subset,gamma_source_subset)
+    deallocate(x_found_source,y_found_source,z_found_source)
     deallocate(ispec_selected_source_all)
     deallocate(xi_source_all,eta_source_all,gamma_source_all,final_distance_source_all)
     deallocate(x_found_source_all,y_found_source_all,z_found_source_all)
-    deallocate(xi_source_subset,eta_source_subset,gamma_source_subset)
-    deallocate(x_found_source,y_found_source,z_found_source)
 
   enddo ! end of loop over all source subsets
 

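The reworked allocation relies on the fact that only the root rank ever reads the gathered arrays: non-root ranks pass 1x1 dummies, since MPI ignores the receive arguments away from the root. A minimal sketch of the same pattern with plain MPI calls (MPI_Gather replaces the solver's gather_all_i wrapper; buffer names and sizes are illustrative):

  program gather_root_only
  use mpi
  implicit none
  integer :: myrank,nproc,ier
  integer :: sendbuf(2)
  integer, allocatable :: recvbuf(:,:)
  call MPI_Init(ier)
  call MPI_Comm_rank(MPI_COMM_WORLD,myrank,ier)
  call MPI_Comm_size(MPI_COMM_WORLD,nproc,ier)
  sendbuf(:) = myrank + 1
  if( myrank == 0 ) then
    allocate(recvbuf(2,0:nproc-1))   ! full-size receive buffer on root only
  else
    allocate(recvbuf(1,1))           ! dummy; receive args unused off-root
  endif
  recvbuf(:,:) = -1                  ! flag to detect a failed gather
  call MPI_Gather(sendbuf,2,MPI_INTEGER,recvbuf,2,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
  if( myrank == 0 .and. minval(recvbuf) <= 0 ) print *,'gather operation failed'
  call MPI_Finalize(ier)
  end program gather_root_only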
Copied: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90 (from rev 22745, seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90)
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -0,0 +1,1241 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and CNRS / INRIA / University of Pau, France
+! (c) Princeton University and CNRS / INRIA / University of Pau
+!                            August 2013
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine read_mesh_databases()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
+  ! local parameters
+  ! timing
+  double precision, external :: wtime
+
+  ! get MPI starting time
+  time_start = wtime()
+
+#ifdef USE_SERIAL_CASCADE_FOR_IOs
+  ! serial i/o
+  you_can_start_doing_IOs = .false.
+  if (myrank > 0) call recv_singlel(you_can_start_doing_IOs,myrank-1,itag)
+#endif
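+  ! note: with USE_SERIAL_CASCADE_FOR_IOs the ranks read the databases one
+  !       after another: rank 0 starts right away, every other rank blocks on
+  !       a logical token from rank-1 and passes it on to rank+1 once its own
+  !       reads are done (see the matching send_singlel further below), which
+  !       serializes filesystem access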
+
+  ! start reading the databases
+  ! read arrays created by the mesher
+
+  ! reads "solver_data.bin" files for crust and mantle
+  call read_mesh_databases_CM()
+
+  ! reads "solver_data.bin" files for outer core
+  call read_mesh_databases_OC()
+
+  ! reads "solver_data.bin" files for inner core
+  call read_mesh_databases_IC()
+
+  ! reads "boundary.bin" files to couple mantle with outer core and inner core boundaries
+  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
+    call read_mesh_databases_coupling_adios()
+  else
+    call read_mesh_databases_coupling()
+  endif
+
+  ! reads "addressing.txt" 2-D addressing (needed for Stacey boundaries)
+  call read_mesh_databases_addressing()
+
+  ! sets up MPI interfaces, inner/outer elements and mesh coloring
+  call read_mesh_databases_MPI()
+
+  ! absorbing boundaries
+  if(ABSORBING_CONDITIONS) then
+    ! reads "stacey.bin" files
+    if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
+      call read_mesh_databases_stacey_adios()
+    else
+      call read_mesh_databases_stacey()
+    endif
+  endif
+
+  ! kernels on regular grids
+  if (SAVE_REGULAR_KL) then
+    call read_mesh_databases_regular_kl()
+  endif
+
+#ifdef USE_SERIAL_CASCADE_FOR_IOs
+  ! serial i/o
+  you_can_start_doing_IOs = .true.
+  if (myrank < NPROC_XI_VAL*NPROC_ETA_VAL-1) call send_singlel(you_can_start_doing_IOs,myrank+1,itag)
+#endif
+
+  ! user output
+  call sync_all()
+  if( myrank == 0 ) then
+    ! elapsed time since the beginning of the mesh reading
+    tCPU = wtime() - time_start
+    write(IMAIN,*)
+    write(IMAIN,*) 'Elapsed time for reading mesh in seconds = ',sngl(tCPU)
+    write(IMAIN,*)
+    call flush_IMAIN()
+  endif
+
+  end subroutine read_mesh_databases
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_CM()
+
+! mesh for CRUST MANTLE region
+
+  use specfem_par
+  use specfem_par_crustmantle
+  implicit none
+
+  ! local parameters
+  integer :: nspec_iso,nspec_tiso,nspec_ani
+  logical :: READ_KAPPA_MU,READ_TISO
+  ! dummy array that does not need to be actually read
+  integer, dimension(:),allocatable :: dummy_idoubling
+  integer :: ier
+
+  ! crust and mantle
+
+  if(ANISOTROPIC_3D_MANTLE_VAL) then
+    READ_KAPPA_MU = .false.
+    READ_TISO = .false.
+    nspec_iso = NSPECMAX_ISO_MANTLE ! 1
+    nspec_tiso = NSPECMAX_TISO_MANTLE ! 1
+    nspec_ani = NSPEC_CRUST_MANTLE
+  else
+    READ_KAPPA_MU = .true.
+    nspec_iso = NSPEC_CRUST_MANTLE
+    if(TRANSVERSE_ISOTROPY_VAL) then
+      nspec_tiso = NSPECMAX_TISO_MANTLE
+    else
+      nspec_tiso = 1
+    endif
+    nspec_ani = NSPECMAX_ANISO_MANTLE ! 1
+    READ_TISO = .true.
+  endif
+
+  ! sets number of top elements for surface movies & noise tomography
+  NSPEC_TOP = NSPEC2D_TOP(IREGION_CRUST_MANTLE)
+
+  ! allocates mass matrices in this slice (will be fully assembled in the solver)
+  !
+  ! in the case of Stacey boundary conditions, add C*deltat/2 contribution to the mass matrix
+  ! on Stacey edges for the crust_mantle and outer_core regions but not for the inner_core region
+  ! thus the mass matrix must be replaced by three mass matrices including the "C" damping matrix
+  !
+  ! if absorbing_conditions are not set or if NCHUNKS=6, only one mass matrix is needed
+  ! for the sake of performance, only the "rmassz" array will be filled; "rmassx" & "rmassy" then remain unused
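+  !
+  ! sketch of the reasoning: the Stacey term acts as a damping force -C*veloc;
+  ! with the explicit Newmark update the velocity is taken at mid-step, so
+  ! M*accel = F - C*veloc can be folded into accel = F / (M + C*deltat/2),
+  ! hence the direction-dependent rmassx/rmassy on absorbing chunks while the
+  ! remaining directions share rmassz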
+
+  ! allocates dummy array
+  allocate(dummy_idoubling(NSPEC_CRUST_MANTLE),stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating dummy idoubling in crust_mantle')
+
+  ! allocates mass matrices
+  allocate(rmassx_crust_mantle(NGLOB_XY_CM), &
+           rmassy_crust_mantle(NGLOB_XY_CM),stat=ier)
+  if(ier /= 0) stop 'error allocating rmassx, rmassy in crust_mantle'
+
+  allocate(rmassz_crust_mantle(NGLOB_CRUST_MANTLE),stat=ier)
+  if(ier /= 0) stop 'error allocating rmassz in crust_mantle'
+
+  allocate(b_rmassx_crust_mantle(NGLOB_XY_CM), &
+           b_rmassy_crust_mantle(NGLOB_XY_CM),stat=ier)
+  if(ier /= 0) stop 'error allocating b_rmassx, b_rmassy in crust_mantle'
+
+  ! reads databases file
+  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
+    call read_arrays_solver_adios(IREGION_CRUST_MANTLE,myrank, &
+        NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB_XY_CM, &
+        nspec_iso,nspec_tiso,nspec_ani, &
+        rho_vp_crust_mantle,rho_vs_crust_mantle, &
+        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+        rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+        c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
+        c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
+        c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
+        c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
+        c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
+        c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+        c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+        ibool_crust_mantle,dummy_idoubling,ispec_is_tiso_crust_mantle, &
+        rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle,rmass_ocean_load, &
+        READ_KAPPA_MU,READ_TISO, &
+        b_rmassx_crust_mantle,b_rmassy_crust_mantle)
+  else
+    call read_arrays_solver(IREGION_CRUST_MANTLE,myrank, &
+        NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB_XY_CM, &
+        nspec_iso,nspec_tiso,nspec_ani, &
+        rho_vp_crust_mantle,rho_vs_crust_mantle, &
+        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+        rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+        c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
+        c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
+        c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
+        c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
+        c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
+        c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+        c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+        ibool_crust_mantle,dummy_idoubling,ispec_is_tiso_crust_mantle, &
+        rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle,rmass_ocean_load, &
+        READ_KAPPA_MU,READ_TISO, &
+        b_rmassx_crust_mantle,b_rmassy_crust_mantle)
+  endif
+
+  ! check that the number of points in this slice is correct
+  if(minval(ibool_crust_mantle(:,:,:,:)) /= 1) &
+      call exit_MPI(myrank,'incorrect global numbering: iboolmin is not equal to 1 in crust and mantle')
+  if(maxval(ibool_crust_mantle(:,:,:,:)) /= NGLOB_CRUST_MANTLE) &
+      call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in crust and mantle')
+
+  deallocate(dummy_idoubling)
+
+  end subroutine read_mesh_databases_CM
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_OC()
+
+! mesh for OUTER CORE region
+
+  use specfem_par
+  use specfem_par_outercore
+
+  implicit none
+
+  ! local parameters
+  integer :: nspec_iso,nspec_tiso,nspec_ani,NGLOB_XY_dummy
+  logical :: READ_KAPPA_MU,READ_TISO
+  integer :: ier
+
+  ! dummy array that does not need to be actually read
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: dummy_rmass
+
+  logical, dimension(:), allocatable :: dummy_ispec_is_tiso
+  integer, dimension(:), allocatable :: dummy_idoubling_outer_core
+
+  ! outer core (no anisotropy nor S velocity)
+  ! rmass_ocean_load is not used in this routine because it is meaningless in the outer core
+  READ_KAPPA_MU = .false.
+  READ_TISO = .false.
+  nspec_iso = NSPEC_OUTER_CORE
+  nspec_tiso = 1
+  nspec_ani = 1
+
+  ! dummy allocation
+  NGLOB_XY_dummy = 1
+
+  allocate(dummy_rmass(NGLOB_XY_dummy), &
+          dummy_ispec_is_tiso(NSPEC_OUTER_CORE), &
+          dummy_idoubling_outer_core(NSPEC_OUTER_CORE), &
+          stat=ier)
+  if(ier /= 0) stop 'error allocating dummy rmass and dummy ispec/idoubling in outer core'
+
+  ! allocates mass matrices in this slice (will be fully assembled in the solver)
+  !
+  ! in the case of Stacey boundary conditions, add C*deltat/2 contribution to the mass matrix
+  ! on Stacey edges for the crust_mantle and outer_core regions but not for the inner_core region
+  ! thus the mass matrix must be replaced by three mass matrices including the "C" damping matrix
+  !
+  ! if absorbing_conditions are not set or if NCHUNKS=6, only one mass matrix is needed
+  ! for the sake of performance, only the "rmassz" array will be filled; "rmassx" & "rmassy" then remain unused
+  allocate(rmass_outer_core(NGLOB_OUTER_CORE),stat=ier)
+  if(ier /= 0) stop 'error allocating rmass in outer core'
+
+  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
+    call read_arrays_solver_adios(IREGION_OUTER_CORE,myrank, &
+              NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB_XY_dummy, &
+              nspec_iso,nspec_tiso,nspec_ani, &
+              vp_outer_core,dummy_array, &
+              xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+              xix_outer_core,xiy_outer_core,xiz_outer_core, &
+              etax_outer_core,etay_outer_core,etaz_outer_core, &
+              gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
+              rhostore_outer_core,kappavstore_outer_core,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              ibool_outer_core,dummy_idoubling_outer_core,dummy_ispec_is_tiso, &
+              dummy_rmass,dummy_rmass,rmass_outer_core,rmass_ocean_load, &
+              READ_KAPPA_MU,READ_TISO, &
+              dummy_rmass,dummy_rmass)
+  else
+    call read_arrays_solver(IREGION_OUTER_CORE,myrank, &
+              NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB_XY_dummy, &
+              nspec_iso,nspec_tiso,nspec_ani, &
+              vp_outer_core,dummy_array, &
+              xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+              xix_outer_core,xiy_outer_core,xiz_outer_core, &
+              etax_outer_core,etay_outer_core,etaz_outer_core, &
+              gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
+              rhostore_outer_core,kappavstore_outer_core,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              ibool_outer_core,dummy_idoubling_outer_core,dummy_ispec_is_tiso, &
+              dummy_rmass,dummy_rmass,rmass_outer_core,rmass_ocean_load, &
+              READ_KAPPA_MU,READ_TISO, &
+              dummy_rmass,dummy_rmass)
+  endif
+
+  deallocate(dummy_idoubling_outer_core,dummy_ispec_is_tiso,dummy_rmass)
+
+  ! check that the number of points in this slice is correct
+  if(minval(ibool_outer_core(:,:,:,:)) /= 1) &
+      call exit_MPI(myrank,'incorrect global numbering: iboolmin is not equal to 1 in outer core')
+  if(maxval(ibool_outer_core(:,:,:,:)) /= NGLOB_OUTER_CORE) then
+    call exit_MPI(myrank, 'incorrect global numbering: &
+        & iboolmax does not equal nglob in outer core')
+  endif
+
+  end subroutine read_mesh_databases_OC
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_IC()
+
+! mesh for INNER CORE region
+
+  use specfem_par
+  use specfem_par_innercore
+  implicit none
+
+  ! local parameters
+  integer :: nspec_iso,nspec_tiso,nspec_ani
+  logical :: READ_KAPPA_MU,READ_TISO
+  integer :: ier
+
+  ! dummy array that does not need to be actually read
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
+  logical, dimension(:),allocatable:: dummy_ispec_is_tiso
+
+  ! inner core (no anisotropy)
+  ! rmass_ocean_load is not used in this routine because it is meaningless in the inner core
+  READ_KAPPA_MU = .true. ! (muvstore needed for attenuation)
+  READ_TISO = .false.
+  nspec_iso = NSPEC_INNER_CORE
+  nspec_tiso = 1
+  if(ANISOTROPIC_INNER_CORE_VAL) then
+    nspec_ani = NSPEC_INNER_CORE
+  else
+    nspec_ani = 1
+  endif
+
+  allocate(dummy_ispec_is_tiso(NSPEC_INNER_CORE), &
+           stat=ier)
+  if(ier /= 0) stop 'error allocating dummy ispec in inner core'
+
+  ! allocates mass matrices in this slice (will be fully assembled in the solver)
+  !
+  ! in the case of Stacey boundary conditions, add C*deltat/2 contribution to the mass matrix
+  ! on Stacey edges for the crust_mantle and outer_core regions but not for the inner_core region
+  ! thus the mass matrix must be replaced by three mass matrices including the "C" damping matrix
+  !
+  ! if absorbing_conditions are not set or if NCHUNKS=6, only one mass matrix is needed
+  ! for the sake of performance, only the "rmassz" array will be filled; "rmassx" & "rmassy" then remain unused
+  allocate(rmassx_inner_core(NGLOB_XY_IC), &
+           rmassy_inner_core(NGLOB_XY_IC),stat=ier)
+  if(ier /= 0) stop 'error allocating rmassx, rmassy in inner_core'
+
+  allocate(rmass_inner_core(NGLOB_INNER_CORE),stat=ier)
+  if(ier /= 0) stop 'error allocating rmass in inner core'
+
+  allocate(b_rmassx_inner_core(NGLOB_XY_IC), &
+           b_rmassy_inner_core(NGLOB_XY_IC),stat=ier)
+  if(ier /= 0) stop 'error allocating b_rmassx, b_rmassy in inner_core'
+
+  ! reads in arrays
+  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
+    call read_arrays_solver_adios(IREGION_INNER_CORE,myrank, &
+              NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB_XY_IC, &
+              nspec_iso,nspec_tiso,nspec_ani, &
+              dummy_array,dummy_array, &
+              xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+              xix_inner_core,xiy_inner_core,xiz_inner_core, &
+              etax_inner_core,etay_inner_core,etaz_inner_core, &
+              gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+              rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
+              dummy_array,dummy_array,dummy_array, &
+              c11store_inner_core,c12store_inner_core,c13store_inner_core, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,c33store_inner_core, &
+              dummy_array,dummy_array,dummy_array, &
+              c44store_inner_core,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              ibool_inner_core,idoubling_inner_core,dummy_ispec_is_tiso, &
+              rmassx_inner_core,rmassy_inner_core,rmass_inner_core,rmass_ocean_load, &
+              READ_KAPPA_MU,READ_TISO, &
+              b_rmassx_inner_core,b_rmassy_inner_core)
+  else
+    call read_arrays_solver(IREGION_INNER_CORE,myrank, &
+              NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB_XY_IC, &
+              nspec_iso,nspec_tiso,nspec_ani, &
+              dummy_array,dummy_array, &
+              xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+              xix_inner_core,xiy_inner_core,xiz_inner_core, &
+              etax_inner_core,etay_inner_core,etaz_inner_core, &
+              gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+              rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
+              dummy_array,dummy_array,dummy_array, &
+              c11store_inner_core,c12store_inner_core,c13store_inner_core, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              dummy_array,dummy_array,c33store_inner_core, &
+              dummy_array,dummy_array,dummy_array, &
+              c44store_inner_core,dummy_array,dummy_array, &
+              dummy_array,dummy_array,dummy_array, &
+              ibool_inner_core,idoubling_inner_core,dummy_ispec_is_tiso, &
+              rmassx_inner_core,rmassy_inner_core,rmass_inner_core,rmass_ocean_load, &
+              READ_KAPPA_MU,READ_TISO, &
+              b_rmassx_inner_core,b_rmassy_inner_core)
+  endif
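+  ! note: both branches above fill the same arrays; they differ only in the
+  ! I/O backend (ADIOS containers versus plain Fortran unformatted files)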
+
+  deallocate(dummy_ispec_is_tiso)
+
+  ! check that the number of points in this slice is correct
+  if(minval(ibool_inner_core(:,:,:,:)) /= 1 .or. maxval(ibool_inner_core(:,:,:,:)) /= NGLOB_INNER_CORE) &
+    call exit_MPI(myrank,'incorrect global numbering: ibool does not span 1 to nglob in inner core')
+
+  end subroutine read_mesh_databases_IC
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_coupling()
+
+! to couple mantle with outer core
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
+  ! local parameters
+  integer :: njunk1,njunk2,njunk3
+  integer :: ier
+
+  ! user output
+  if( myrank == 0 ) write(IMAIN,*) 'reading coupling surfaces...'
+
+  ! crust and mantle
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
+
+  ! Stacey put back
+  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
+        status='old',form='unformatted',action='read',iostat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error opening crust_mantle boundary.bin file')
+
+  read(27) nspec2D_xmin_crust_mantle
+  read(27) nspec2D_xmax_crust_mantle
+  read(27) nspec2D_ymin_crust_mantle
+  read(27) nspec2D_ymax_crust_mantle
+  read(27) njunk1
+  read(27) njunk2
+
+! boundary parameters
+  read(27) ibelm_xmin_crust_mantle
+  read(27) ibelm_xmax_crust_mantle
+  read(27) ibelm_ymin_crust_mantle
+  read(27) ibelm_ymax_crust_mantle
+  read(27) ibelm_bottom_crust_mantle
+  read(27) ibelm_top_crust_mantle
+
+  read(27) normal_xmin_crust_mantle
+  read(27) normal_xmax_crust_mantle
+  read(27) normal_ymin_crust_mantle
+  read(27) normal_ymax_crust_mantle
+  read(27) normal_bottom_crust_mantle
+  read(27) normal_top_crust_mantle
+
+  read(27) jacobian2D_xmin_crust_mantle
+  read(27) jacobian2D_xmax_crust_mantle
+  read(27) jacobian2D_ymin_crust_mantle
+  read(27) jacobian2D_ymax_crust_mantle
+  read(27) jacobian2D_bottom_crust_mantle
+  read(27) jacobian2D_top_crust_mantle
+  close(27)
+
+  ! read parameters to couple fluid and solid regions
+  !
+  ! outer core
+
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
+
+  ! boundary parameters
+
+  ! Stacey put back
+  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
+        status='old',form='unformatted',action='read',iostat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error opening outer_core boundary.bin file')
+
+  read(27) nspec2D_xmin_outer_core
+  read(27) nspec2D_xmax_outer_core
+  read(27) nspec2D_ymin_outer_core
+  read(27) nspec2D_ymax_outer_core
+  read(27) njunk1
+  read(27) njunk2
+
+  nspec2D_zmin_outer_core = NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
+
+  read(27) ibelm_xmin_outer_core
+  read(27) ibelm_xmax_outer_core
+  read(27) ibelm_ymin_outer_core
+  read(27) ibelm_ymax_outer_core
+  read(27) ibelm_bottom_outer_core
+  read(27) ibelm_top_outer_core
+
+  read(27) normal_xmin_outer_core
+  read(27) normal_xmax_outer_core
+  read(27) normal_ymin_outer_core
+  read(27) normal_ymax_outer_core
+  read(27) normal_bottom_outer_core
+  read(27) normal_top_outer_core
+
+  read(27) jacobian2D_xmin_outer_core
+  read(27) jacobian2D_xmax_outer_core
+  read(27) jacobian2D_ymin_outer_core
+  read(27) jacobian2D_ymax_outer_core
+  read(27) jacobian2D_bottom_outer_core
+  read(27) jacobian2D_top_outer_core
+  close(27)
+
+  !
+  ! inner core
+  !
+
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_INNER_CORE,LOCAL_PATH)
+
+  ! read info for vertical edges for central cube matching in inner core
+  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
+        status='old',form='unformatted',action='read',iostat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error opening inner_core boundary.bin file')
+
+  read(27) nspec2D_xmin_inner_core
+  read(27) nspec2D_xmax_inner_core
+  read(27) nspec2D_ymin_inner_core
+  read(27) nspec2D_ymax_inner_core
+  read(27) njunk1
+  read(27) njunk2
+
+  ! boundary parameters
+  read(27) ibelm_xmin_inner_core
+  read(27) ibelm_xmax_inner_core
+  read(27) ibelm_ymin_inner_core
+  read(27) ibelm_ymax_inner_core
+  read(27) ibelm_bottom_inner_core
+  read(27) ibelm_top_inner_core
+  close(27)
+
+  ! -- Boundary Mesh for crust and mantle ---
+  if (SAVE_BOUNDARY_MESH .and. SIMULATION_TYPE == 3) then
+
+    call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
+
+    open(unit=27,file=prname(1:len_trim(prname))//'boundary_disc.bin', &
+          status='old',form='unformatted',action='read',iostat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error opening boundary_disc.bin file')
+
+    read(27) njunk1,njunk2,njunk3
+    if (njunk1 /= NSPEC2D_MOHO .or. njunk2 /= NSPEC2D_400 .or. njunk3 /= NSPEC2D_670) &
+               call exit_mpi(myrank, 'Error reading boundary_disc.bin file')
+    read(27) ibelm_moho_top
+    read(27) ibelm_moho_bot
+    read(27) ibelm_400_top
+    read(27) ibelm_400_bot
+    read(27) ibelm_670_top
+    read(27) ibelm_670_bot
+    read(27) normal_moho
+    read(27) normal_400
+    read(27) normal_670
+    close(27)
+
+    k_top = 1
+    k_bot = NGLLZ
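+    ! note (assumed convention): for elements above a discontinuity the
+    ! interface is their bottom GLL plane (k = 1) and for elements below it
+    ! is their top plane (k = NGLLZ); k_top/k_bot select these planes for
+    ! the boundary kernel evaluation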
+
+    ! initialization
+    moho_kl = 0.; d400_kl = 0.; d670_kl = 0.; cmb_kl = 0.; icb_kl = 0.
+  endif
+
+  end subroutine read_mesh_databases_coupling
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_addressing()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
+  ! local parameters
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+  integer, dimension(0:NPROCTOT_VAL-1) :: ichunk_slice,iproc_xi_slice,iproc_eta_slice
+  integer :: ier,iproc,iproc_read,iproc_xi,iproc_eta
+
+  ! open file with global slice number addressing
+  if(myrank == 0) then
+    open(unit=IIN,file=trim(OUTPUT_FILES)//'/addressing.txt',status='old',action='read',iostat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error opening addressing.txt')
+
+    do iproc = 0,NPROCTOT_VAL-1
+      read(IIN,*) iproc_read,ichunk,iproc_xi,iproc_eta
+
+      if(iproc_read /= iproc) call exit_MPI(myrank,'incorrect slice number read')
+
+      addressing(ichunk,iproc_xi,iproc_eta) = iproc
+      ichunk_slice(iproc) = ichunk
+      iproc_xi_slice(iproc) = iproc_xi
+      iproc_eta_slice(iproc) = iproc_eta
+    enddo
+    close(IIN)
+  endif
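+  ! note: addressing.txt contains one line per MPI process in the form
+  ! "rank  ichunk  iproc_xi  iproc_eta", e.g. (values illustrative only):
+  !   0  1  0  0
+  !   1  1  1  0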
+
+  ! broadcast the information read on the master to the nodes
+  call bcast_all_i(addressing,NCHUNKS_VAL*NPROC_XI_VAL*NPROC_ETA_VAL)
+  call bcast_all_i(ichunk_slice,NPROCTOT_VAL)
+  call bcast_all_i(iproc_xi_slice,NPROCTOT_VAL)
+  call bcast_all_i(iproc_eta_slice,NPROCTOT_VAL)
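+  ! (bcast_all_i is assumed here to be the usual wrapper around MPI_BCAST
+  !  from rank 0, schematically:
+  !  call MPI_BCAST(buffer,countval,MPI_INTEGER,0,MPI_COMM_WORLD,ier) )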
+
+  ! output a topology map of the slices (todo: the fixed "20x" indentation should scale with the number of processes)
+  if (myrank == 0 ) then
+    if( NCHUNKS_VAL == 6 .and. NPROCTOT_VAL < 1000 ) then
+      write(IMAIN,*) 'Spatial distribution of the slices'
+      do iproc_xi = NPROC_XI_VAL-1, 0, -1
+        write(IMAIN,'(20x)',advance='no')
+        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
+          ichunk = CHUNK_AB
+          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
+        enddo
+        write(IMAIN,'(1x)',advance='yes')
+      enddo
+      write(IMAIN, *) ' '
+      do iproc_xi = NPROC_XI_VAL-1, 0, -1
+        write(IMAIN,'(1x)',advance='no')
+        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
+          ichunk = CHUNK_BC
+          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
+        enddo
+        write(IMAIN,'(3x)',advance='no')
+        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
+          ichunk = CHUNK_AC
+          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
+        enddo
+        write(IMAIN,'(3x)',advance='no')
+        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
+          ichunk = CHUNK_BC_ANTIPODE
+          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
+        enddo
+        write(IMAIN,'(1x)',advance='yes')
+      enddo
+      write(IMAIN, *) ' '
+      do iproc_xi = NPROC_XI_VAL-1, 0, -1
+        write(IMAIN,'(20x)',advance='no')
+        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
+          ichunk = CHUNK_AB_ANTIPODE
+          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
+        enddo
+        write(IMAIN,'(1x)',advance='yes')
+      enddo
+      write(IMAIN, *) ' '
+      do iproc_xi = NPROC_XI_VAL-1, 0, -1
+        write(IMAIN,'(20x)',advance='no')
+        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
+          ichunk = CHUNK_AC_ANTIPODE
+          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
+        enddo
+        write(IMAIN,'(1x)',advance='yes')
+      enddo
+      write(IMAIN, *) ' '
+    endif
+  endif
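+  ! note: the map above prints the slices chunk by chunk in a cross-like
+  ! layout; schematically (for one process per chunk):
+  !              AB
+  !   BC   AC   BC_ANTIPODE
+  !              AB_ANTIPODE
+  !              AC_ANTIPODE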
+
+  ! determine the chunk number of this slice using the addressing
+  ! (needed for Stacey conditions)
+  ichunk = ichunk_slice(myrank)
+
+  end subroutine read_mesh_databases_addressing
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_MPI()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  implicit none
+
+  ! local parameters
+  real :: percentage_edge
+  integer :: ier
+
+  ! read MPI interfaces from file
+
+  ! crust mantle
+  if( ADIOS_ENABLED .and. ADIOS_FOR_MPI_ARRAYS ) then
+    call read_mesh_databases_MPI_CM_adios()
+  else
+    call read_mesh_databases_MPI_CM()
+  endif
+
+  allocate(buffer_send_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+          buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+          request_send_vector_cm(num_interfaces_crust_mantle), &
+          request_recv_vector_cm(num_interfaces_crust_mantle), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_crust_mantle etc.')
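+  ! note: each buffer holds one NDIM-component vector per interface point
+  ! (up to max_nibool_interfaces_cm) for each neighbour slice; together with
+  ! the request arrays they are assumed to serve the non-blocking
+  ! sends/receives used to assemble values on the MPI interfaces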
+
+  if( SIMULATION_TYPE == 3 ) then
+    allocate(b_buffer_send_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+            b_buffer_recv_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+            b_request_send_vector_cm(num_interfaces_crust_mantle), &
+            b_request_recv_vector_cm(num_interfaces_crust_mantle), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_cm etc.')
+  endif
+
+  ! outer core
+  if( ADIOS_ENABLED .and. ADIOS_FOR_MPI_ARRAYS ) then
+    call read_mesh_databases_MPI_OC_adios()
+  else
+    call read_mesh_databases_MPI_OC()
+  endif
+
+  allocate(buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+          buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+          request_send_scalar_oc(num_interfaces_outer_core), &
+          request_recv_scalar_oc(num_interfaces_outer_core), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_scalar_outer_core etc.')
+
+  if( SIMULATION_TYPE == 3 ) then
+    allocate(b_buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+            b_buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+            b_request_send_scalar_oc(num_interfaces_outer_core), &
+            b_request_recv_scalar_oc(num_interfaces_outer_core), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_scalar_outer_core etc.')
+  endif
+
+  ! inner core
+  if( ADIOS_ENABLED .and. ADIOS_FOR_MPI_ARRAYS ) then
+    call read_mesh_databases_MPI_IC_adios()
+  else
+    call read_mesh_databases_MPI_IC()
+  endif
+
+  allocate(buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+          buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+          request_send_vector_ic(num_interfaces_inner_core), &
+          request_recv_vector_ic(num_interfaces_inner_core), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_inner_core etc.')
+
+  if( SIMULATION_TYPE == 3 ) then
+    allocate(b_buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+            b_buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+            b_request_send_vector_ic(num_interfaces_inner_core), &
+            b_request_recv_vector_ic(num_interfaces_inner_core), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_inner_core etc.')
+  endif
+
+
+  ! user output
+  if(myrank == 0) then
+    write(IMAIN,*) 'for overlapping communications with calculations:'
+    write(IMAIN,*)
+
+    percentage_edge = 100. * nspec_outer_crust_mantle / real(NSPEC_CRUST_MANTLE)
+    write(IMAIN,*) 'percentage of edge elements in crust/mantle ',percentage_edge,'%'
+    write(IMAIN,*) 'percentage of volume elements in crust/mantle ',100. - percentage_edge,'%'
+    write(IMAIN,*)
+
+    percentage_edge = 100.* nspec_outer_outer_core / real(NSPEC_OUTER_CORE)
+    write(IMAIN,*) 'percentage of edge elements in outer core ',percentage_edge,'%'
+    write(IMAIN,*) 'percentage of volume elements in outer core ',100. - percentage_edge,'%'
+    write(IMAIN,*)
+
+    percentage_edge = 100. * nspec_outer_inner_core / real(NSPEC_INNER_CORE)
+    write(IMAIN,*) 'percentage of edge elements in inner core ',percentage_edge,'%'
+    write(IMAIN,*) 'percentage of volume elements in inner core ',100. - percentage_edge,'%'
+    write(IMAIN,*)
+    call flush_IMAIN()
+  endif
+  ! synchronizes MPI processes
+  call sync_all()
+
+  end subroutine read_mesh_databases_MPI
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_MPI_CM()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  implicit none
+
+  ! local parameters
+  integer :: ier
+
+  ! crust mantle region
+
+  ! create the name for the database of the current slice and region
+  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
+
+  open(unit=IIN,file=prname(1:len_trim(prname))//'solver_data_mpi.bin', &
+       status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error opening solver_data_mpi.bin')
+
+  ! MPI interfaces
+  read(IIN) num_interfaces_crust_mantle
+  allocate(my_neighbours_crust_mantle(num_interfaces_crust_mantle), &
+          nibool_interfaces_crust_mantle(num_interfaces_crust_mantle), &
+          stat=ier)
+  if( ier /= 0 ) &
+    call exit_mpi(myrank,'error allocating array my_neighbours_crust_mantle etc.')
+
+  if( num_interfaces_crust_mantle > 0 ) then
+    read(IIN) max_nibool_interfaces_cm
+    allocate(ibool_interfaces_crust_mantle(max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_crust_mantle')
+
+    read(IIN) my_neighbours_crust_mantle
+    read(IIN) nibool_interfaces_crust_mantle
+    read(IIN) ibool_interfaces_crust_mantle
+  else
+    ! dummy array
+    max_nibool_interfaces_cm = 0
+    allocate(ibool_interfaces_crust_mantle(0,0),stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array dummy ibool_interfaces_crust_mantle')
+  endif
+
+  ! inner / outer elements
+  read(IIN) nspec_inner_crust_mantle,nspec_outer_crust_mantle
+  read(IIN) num_phase_ispec_crust_mantle
+  if( num_phase_ispec_crust_mantle < 0 ) &
+    call exit_mpi(myrank,'error: num_phase_ispec_crust_mantle is negative')
+
+  allocate(phase_ispec_inner_crust_mantle(num_phase_ispec_crust_mantle,2),&
+          stat=ier)
+  if( ier /= 0 ) &
+    call exit_mpi(myrank,'error allocating array phase_ispec_inner_crust_mantle')
+
+  if(num_phase_ispec_crust_mantle > 0 ) read(IIN) phase_ispec_inner_crust_mantle
+
+  ! mesh coloring for GPUs
+  if( USE_MESH_COLORING_GPU ) then
+    ! colors
+    read(IIN) num_colors_outer_crust_mantle,num_colors_inner_crust_mantle
+
+    allocate(num_elem_colors_crust_mantle(num_colors_outer_crust_mantle + num_colors_inner_crust_mantle), &
+            stat=ier)
+    if( ier /= 0 ) &
+      call exit_mpi(myrank,'error allocating num_elem_colors_crust_mantle array')
+
+    read(IIN) num_elem_colors_crust_mantle
+  else
+    ! allocates dummy arrays
+    num_colors_outer_crust_mantle = 0
+    num_colors_inner_crust_mantle = 0
+    allocate(num_elem_colors_crust_mantle(num_colors_outer_crust_mantle + num_colors_inner_crust_mantle), &
+            stat=ier)
+    if( ier /= 0 ) &
+      call exit_mpi(myrank,'error allocating num_elem_colors_crust_mantle array')
+  endif
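+  ! note (assumed rationale): num_elem_colors_crust_mantle lists the number
+  ! of elements of each color; processing one color at a time lets the GPU
+  ! update elements concurrently without two of them sharing a global point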
+
+  close(IIN)
+
+  end subroutine read_mesh_databases_MPI_CM
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_MPI_OC()
+
+  use specfem_par
+  use specfem_par_outercore
+  implicit none
+
+  ! local parameters
+  integer :: ier
+
+  ! outer core region
+
+  ! create the name for the database of the current slice and region
+  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
+
+  open(unit=IIN,file=prname(1:len_trim(prname))//'solver_data_mpi.bin', &
+       status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error opening solver_data_mpi.bin')
+
+  ! MPI interfaces
+  read(IIN) num_interfaces_outer_core
+  allocate(my_neighbours_outer_core(num_interfaces_outer_core), &
+          nibool_interfaces_outer_core(num_interfaces_outer_core), &
+          stat=ier)
+  if( ier /= 0 ) &
+    call exit_mpi(myrank,'error allocating array my_neighbours_outer_core etc.')
+
+  if( num_interfaces_outer_core > 0 ) then
+    read(IIN) max_nibool_interfaces_oc
+    allocate(ibool_interfaces_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_outer_core')
+
+    read(IIN) my_neighbours_outer_core
+    read(IIN) nibool_interfaces_outer_core
+    read(IIN) ibool_interfaces_outer_core
+  else
+    ! dummy array
+    max_nibool_interfaces_oc = 0
+    allocate(ibool_interfaces_outer_core(0,0),stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array dummy ibool_interfaces_outer_core')
+  endif
+
+  ! inner / outer elements
+  read(IIN) nspec_inner_outer_core,nspec_outer_outer_core
+  read(IIN) num_phase_ispec_outer_core
+  if( num_phase_ispec_outer_core < 0 ) &
+    call exit_mpi(myrank,'error: num_phase_ispec_outer_core is negative')
+
+  allocate(phase_ispec_inner_outer_core(num_phase_ispec_outer_core,2),&
+          stat=ier)
+  if( ier /= 0 ) &
+    call exit_mpi(myrank,'error allocating array phase_ispec_inner_outer_core')
+
+  if(num_phase_ispec_outer_core > 0 ) read(IIN) phase_ispec_inner_outer_core
+
+  ! mesh coloring for GPUs
+  if( USE_MESH_COLORING_GPU ) then
+    ! colors
+    read(IIN) num_colors_outer_outer_core,num_colors_inner_outer_core
+
+    allocate(num_elem_colors_outer_core(num_colors_outer_outer_core + num_colors_inner_outer_core), &
+            stat=ier)
+    if( ier /= 0 ) &
+      call exit_mpi(myrank,'error allocating num_elem_colors_outer_core array')
+
+    read(IIN) num_elem_colors_outer_core
+  else
+    ! allocates dummy arrays
+    num_colors_outer_outer_core = 0
+    num_colors_inner_outer_core = 0
+    allocate(num_elem_colors_outer_core(num_colors_outer_outer_core + num_colors_inner_outer_core), &
+            stat=ier)
+    if( ier /= 0 ) &
+      call exit_mpi(myrank,'error allocating num_elem_colors_outer_core array')
+  endif
+
+  close(IIN)
+
+  end subroutine read_mesh_databases_MPI_OC
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_MPI_IC()
+
+  use specfem_par
+  use specfem_par_innercore
+  implicit none
+
+  ! local parameters
+  integer :: ier
+
+  ! inner core region
+
+  ! create the name for the database of the current slice and region
+  call create_name_database(prname,myrank,IREGION_INNER_CORE,LOCAL_PATH)
+
+  open(unit=IIN,file=prname(1:len_trim(prname))//'solver_data_mpi.bin', &
+       status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error opening solver_data_mpi.bin')
+
+  ! MPI interfaces
+  read(IIN) num_interfaces_inner_core
+  allocate(my_neighbours_inner_core(num_interfaces_inner_core), &
+          nibool_interfaces_inner_core(num_interfaces_inner_core), &
+          stat=ier)
+  if( ier /= 0 ) &
+    call exit_mpi(myrank,'error allocating array my_neighbours_inner_core etc.')
+
+  if( num_interfaces_inner_core > 0 ) then
+    read(IIN) max_nibool_interfaces_ic
+    allocate(ibool_interfaces_inner_core(max_nibool_interfaces_ic,num_interfaces_inner_core), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_inner_core')
+
+    read(IIN) my_neighbours_inner_core
+    read(IIN) nibool_interfaces_inner_core
+    read(IIN) ibool_interfaces_inner_core
+  else
+    ! dummy array
+    max_nibool_interfaces_ic = 0
+    allocate(ibool_interfaces_inner_core(0,0),stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array dummy ibool_interfaces_inner_core')
+  endif
+
+  ! inner / outer elements
+  read(IIN) nspec_inner_inner_core,nspec_outer_inner_core
+  read(IIN) num_phase_ispec_inner_core
+  if( num_phase_ispec_inner_core < 0 ) &
+    call exit_mpi(myrank,'error: num_phase_ispec_inner_core is negative')
+
+  allocate(phase_ispec_inner_inner_core(num_phase_ispec_inner_core,2),&
+          stat=ier)
+  if( ier /= 0 ) &
+    call exit_mpi(myrank,'error allocating array phase_ispec_inner_inner_core')
+
+  if(num_phase_ispec_inner_core > 0 ) read(IIN) phase_ispec_inner_inner_core
+
+  ! mesh coloring for GPUs
+  if( USE_MESH_COLORING_GPU ) then
+    ! colors
+    read(IIN) num_colors_outer_inner_core,num_colors_inner_inner_core
+
+    allocate(num_elem_colors_inner_core(num_colors_outer_inner_core + num_colors_inner_inner_core), &
+            stat=ier)
+    if( ier /= 0 ) &
+      call exit_mpi(myrank,'error allocating num_elem_colors_inner_core array')
+
+    read(IIN) num_elem_colors_inner_core
+  else
+    ! allocates dummy arrays
+    num_colors_outer_inner_core = 0
+    num_colors_inner_inner_core = 0
+    allocate(num_elem_colors_inner_core(num_colors_outer_inner_core + num_colors_inner_inner_core), &
+            stat=ier)
+    if( ier /= 0 ) &
+      call exit_mpi(myrank,'error allocating num_elem_colors_inner_core array')
+  endif
+
+  close(IIN)
+
+  end subroutine read_mesh_databases_MPI_IC
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_stacey()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
+  ! local parameters
+  integer :: ier
+
+  ! crust and mantle
+
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
+
+  ! read arrays for Stacey conditions
+  open(unit=27,file=prname(1:len_trim(prname))//'stacey.bin', &
+        status='old',form='unformatted',action='read',iostat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error opening stacey.bin file for crust mantle')
+
+  read(27) nimin_crust_mantle
+  read(27) nimax_crust_mantle
+  read(27) njmin_crust_mantle
+  read(27) njmax_crust_mantle
+  read(27) nkmin_xi_crust_mantle
+  read(27) nkmin_eta_crust_mantle
+  close(27)
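+  ! note (assumed semantics): the nimin/nimax, njmin/njmax and nkmin_*
+  ! arrays just read hold, for each 2-D boundary element, the GLL index
+  ! ranges over which the Stacey contributions are applied on that face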
+
+  ! outer core
+
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
+
+  ! read arrays for Stacey conditions
+  open(unit=27,file=prname(1:len_trim(prname))//'stacey.bin', &
+        status='old',form='unformatted',action='read',iostat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error opening stacey.bin file for outer core')
+
+  read(27) nimin_outer_core
+  read(27) nimax_outer_core
+  read(27) njmin_outer_core
+  read(27) njmax_outer_core
+  read(27) nkmin_xi_outer_core
+  read(27) nkmin_eta_outer_core
+  close(27)
+
+  end subroutine read_mesh_databases_stacey
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_regular_kl()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
+  ! local parameters
+  integer, dimension(:), allocatable :: slice_number
+  integer :: i,isp,ier
+  ! grid parameters
+  type kl_reg_grid_variables
+    sequence
+    real dlat
+    real dlon
+    integer nlayer
+    real rlayer(NM_KL_REG_LAYER)
+    integer ndoubling(NM_KL_REG_LAYER)
+    integer nlat(NM_KL_REG_LAYER)
+    integer nlon(NM_KL_REG_LAYER)
+    integer npts_total
+    integer npts_before_layer(NM_KL_REG_LAYER+1)
+  end type kl_reg_grid_variables
+  type (kl_reg_grid_variables) KL_REG_GRID
+
+  call read_kl_regular_grid(myrank, KL_REG_GRID)
+
+  if( myrank == 0 ) then
+    ! master process
+    allocate(slice_number(KL_REG_GRID%npts_total),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating slice_number array')
+
+    ! print *, 'slice npts =', KL_REG_GRID%npts_total
+    call find_regular_grid_slice_number(slice_number, KL_REG_GRID, NCHUNKS_VAL, &
+                                        NPROC_XI_VAL, NPROC_ETA_VAL)
+
+    do i = NPROCTOT_VAL-1,0,-1
+      npoints_slice = 0
+      do isp = 1,KL_REG_GRID%npts_total
+        if (slice_number(isp) == i) then
+          npoints_slice = npoints_slice + 1
+          if (npoints_slice > NM_KL_REG_PTS) stop 'Exceeding NM_KL_REG_PTS limit'
+          points_slice(npoints_slice) = isp
+        endif
+      enddo
+
+      if (i /= 0) then
+        call send_singlei(npoints_slice,i,i)
+        if (npoints_slice > 0) then
+          call send_i(points_slice,npoints_slice,i,2*i)
+        endif
+      endif
+    enddo
+
+    open(unit=IOUT,file=trim(OUTPUT_FILES)//'/kl_grid_slice.txt',status='unknown',action='write',iostat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error opening file kl_grid_slice.txt for writing')
+    write(IOUT,*) slice_number
+    close(IOUT)
+
+    deallocate(slice_number)
+  else
+    ! slave processes
+    call recv_singlei(npoints_slice,0,myrank)
+    if (npoints_slice > 0) then
+      call recv_i(points_slice,npoints_slice,0,2*myrank)
+    endif
+  endif
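+  ! note: the master loops over all slices, sending each non-zero rank its
+  ! point count (tag i) and then the point indices (tag 2*i); the final
+  ! iteration (i = 0) leaves the master's own points in npoints_slice and
+  ! points_slice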
+
+  ! this is the core part that takes up most of the computation time,
+  ! and it presumably runs faster as more processes are involved.
+  if (npoints_slice > 0) then
+    call locate_regular_points(npoints_slice, points_slice, KL_REG_GRID, &
+                           NEX_XI, NSPEC_CRUST_MANTLE, &
+                           xstore_crust_mantle, ystore_crust_mantle, zstore_crust_mantle, &
+                           ibool_crust_mantle, &
+                           xigll, yigll, zigll, &
+                           ispec_reg, hxir_reg, hetar_reg, hgammar_reg)
+  endif
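+  ! note (assumed semantics): locate_regular_points returns, for each target
+  ! point, the enclosing element ispec_reg and the 1-D Lagrange interpolator
+  ! values hxir_reg/hetar_reg/hgammar_reg used later to interpolate the
+  ! kernels onto the regular grid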
+
+  ! user output
+  if (myrank==0) then
+    write(IMAIN,*) ' '
+    write(IMAIN,*) 'Finished locating kernel output regular grid'
+    write(IMAIN,*) ' '
+    call flush_IMAIN()
+  endif
+
+  end subroutine read_mesh_databases_regular_kl
+
+

Deleted: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -1,1127 +0,0 @@
-!=====================================================================
-!
-!          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0
-!          --------------------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!                        Princeton University, USA
-!             and CNRS / INRIA / University of Pau, France
-! (c) Princeton University and CNRS / INRIA / University of Pau
-!                            August 2013
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-
-  subroutine read_mesh_databases()
-
-  use specfem_par
-  use specfem_par_crustmantle
-  use specfem_par_innercore
-  use specfem_par_outercore
-
-  implicit none
-
-  ! local parameters
-  ! timing
-  double precision, external :: wtime
-
-  ! get MPI starting time
-  time_start = wtime()
-
-  ! start reading the databases
-  ! read arrays created by the mesher
-
-  ! reads "solver_data.bin" files for crust and mantle
-  call read_mesh_databases_CM()
-
-  ! reads "solver_data.bin" files for outer core
-  call read_mesh_databases_OC()
-
-  ! reads "solver_data.bin" files for inner core
-  call read_mesh_databases_IC()
-
-  ! reads "boundary.bin" files to couple mantle with outer core and inner core boundaries
-  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
-    call read_mesh_databases_coupling_adios()
-  else
-    call read_mesh_databases_coupling()
-  endif
-
-  ! reads "addressing.txt" 2-D addressing (needed for Stacey boundaries)
-  call read_mesh_databases_addressing()
-
-  ! sets up MPI interfaces, inner/outer elements and mesh coloring
-  call read_mesh_databases_MPI()
-
-  ! absorbing boundaries
-  if(ABSORBING_CONDITIONS) then
-    ! reads "stacey.bin" files
-    if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
-      call read_mesh_databases_stacey_adios()
-    else
-      call read_mesh_databases_stacey()
-    endif
-  endif
-
-  ! user output
-  call sync_all()
-  if( myrank == 0 ) then
-    ! elapsed time since beginning of mesh generation
-    tCPU = wtime() - time_start
-    write(IMAIN,*)
-    write(IMAIN,*) 'Elapsed time for reading mesh in seconds = ',sngl(tCPU)
-    write(IMAIN,*)
-    call flush_IMAIN()
-  endif
-
-  end subroutine read_mesh_databases
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_CM()
-
-! mesh for CRUST MANTLE region
-
-  use specfem_par
-  use specfem_par_crustmantle
-  implicit none
-
-  ! local parameters
-  integer :: nspec_iso,nspec_tiso,nspec_ani
-  logical :: READ_KAPPA_MU,READ_TISO
-  ! dummy array that does not need to be actually read
-  integer, dimension(:),allocatable :: dummy_idoubling
-  integer :: ier
-
-  ! crust and mantle
-
-  if(ANISOTROPIC_3D_MANTLE_VAL) then
-    READ_KAPPA_MU = .false.
-    READ_TISO = .false.
-    nspec_iso = NSPECMAX_ISO_MANTLE ! 1
-    nspec_tiso = NSPECMAX_TISO_MANTLE ! 1
-    nspec_ani = NSPEC_CRUST_MANTLE
-  else
-    READ_KAPPA_MU = .true.
-    nspec_iso = NSPEC_CRUST_MANTLE
-    if(TRANSVERSE_ISOTROPY_VAL) then
-      nspec_tiso = NSPECMAX_TISO_MANTLE
-    else
-      nspec_tiso = 1
-    endif
-    nspec_ani = NSPECMAX_ANISO_MANTLE ! 1
-    READ_TISO = .true.
-  endif
-
-  ! sets number of top elements for surface movies & noise tomography
-  NSPEC_TOP = NSPEC2D_TOP(IREGION_CRUST_MANTLE)
-
-  ! allocates mass matrices in this slice (will be fully assembled in the solver)
-  !
-  ! in the case of Stacey boundary conditions, add C*deltat/2 contribution to the mass matrix
-  ! on Stacey edges for the crust_mantle and outer_core regions but not for the inner_core region
-  ! thus the mass matrix must be replaced by three mass matrices including the "C" damping matrix
-  !
-  ! if absorbing_conditions are not set or if NCHUNKS=6, only one mass matrix is needed
-  ! for the sake of performance, only "rmassz" array will be filled and "rmassx" & "rmassy" will be obsolete
-
-  ! allocates dummy array
-  allocate(dummy_idoubling(NSPEC_CRUST_MANTLE),stat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error allocating dummy idoubling in crust_mantle')
-
-  ! allocates mass matrices
-  allocate(rmassx_crust_mantle(NGLOB_XY_CM), &
-           rmassy_crust_mantle(NGLOB_XY_CM),stat=ier)
-  if(ier /= 0) stop 'error allocating rmassx, rmassy in crust_mantle'
-
-  allocate(rmassz_crust_mantle(NGLOB_CRUST_MANTLE),stat=ier)
-  if(ier /= 0) stop 'error allocating rmassz in crust_mantle'
-
-  allocate(b_rmassx_crust_mantle(NGLOB_XY_CM), &
-           b_rmassy_crust_mantle(NGLOB_XY_CM),stat=ier)
-  if(ier /= 0) stop 'error allocating b_rmassx, b_rmassy in crust_mantle'
-
-  ! reads databases file
-  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
-    call read_arrays_solver_adios(IREGION_CRUST_MANTLE,myrank, &
-        NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB_XY_CM, &
-        nspec_iso,nspec_tiso,nspec_ani, &
-        rho_vp_crust_mantle,rho_vs_crust_mantle, &
-        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-        rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-        c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-        c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-        c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-        c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-        c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-        c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-        c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-        ibool_crust_mantle,dummy_idoubling,ispec_is_tiso_crust_mantle, &
-        rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle,rmass_ocean_load, &
-        READ_KAPPA_MU,READ_TISO, &
-        b_rmassx_crust_mantle,b_rmassy_crust_mantle)
-  else
-    call read_arrays_solver(IREGION_CRUST_MANTLE,myrank, &
-        NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB_XY_CM, &
-        nspec_iso,nspec_tiso,nspec_ani, &
-        rho_vp_crust_mantle,rho_vs_crust_mantle, &
-        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-        rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-        c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-        c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-        c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-        c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-        c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-        c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-        c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-        ibool_crust_mantle,dummy_idoubling,ispec_is_tiso_crust_mantle, &
-        rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle,rmass_ocean_load, &
-        READ_KAPPA_MU,READ_TISO, &
-        b_rmassx_crust_mantle,b_rmassy_crust_mantle)
-  endif
-
-  ! check that the number of points in this slice is correct
-  if(minval(ibool_crust_mantle(:,:,:,:)) /= 1) &
-      call exit_MPI(myrank,'incorrect global numbering: iboolmin is not equal to 1 in crust and mantle')
-  if(maxval(ibool_crust_mantle(:,:,:,:)) /= NGLOB_CRUST_MANTLE) &
-      call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in crust and mantle')
-
-  deallocate(dummy_idoubling)
-
-  end subroutine read_mesh_databases_CM
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_OC()
-
-! mesh for OUTER CORE region
-
-  use specfem_par
-  use specfem_par_outercore
-
-  implicit none
-
-  ! local parameters
-  integer :: nspec_iso,nspec_tiso,nspec_ani,NGLOB_XY_dummy
-  logical :: READ_KAPPA_MU,READ_TISO
-  integer :: ier
-
-  ! dummy array that does not need to be actually read
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
-  real(kind=CUSTOM_REAL), dimension(:), allocatable :: dummy_rmass
-
-  logical, dimension(:), allocatable :: dummy_ispec_is_tiso
-  integer, dimension(:), allocatable :: dummy_idoubling_outer_core
-
-  ! outer core (no anisotropy nor S velocity)
-  ! rmass_ocean_load is not used in this routine because it is meaningless in the outer core
-  READ_KAPPA_MU = .false.
-  READ_TISO = .false.
-  nspec_iso = NSPEC_OUTER_CORE
-  nspec_tiso = 1
-  nspec_ani = 1
-
-  ! dummy allocation
-  NGLOB_XY_dummy = 1
-
-  allocate(dummy_rmass(NGLOB_XY_dummy), &
-          dummy_ispec_is_tiso(NSPEC_OUTER_CORE), &
-          dummy_idoubling_outer_core(NSPEC_OUTER_CORE), &
-          stat=ier)
-  if(ier /= 0) stop 'error allocating dummy rmass and dummy ispec/idoubling in outer core'
-
-  ! allocates mass matrices in this slice (will be fully assembled in the solver)
-  !
-  ! in the case of Stacey boundary conditions, add C*deltat/2 contribution to the mass matrix
-  ! on Stacey edges for the crust_mantle and outer_core regions but not for the inner_core region
-  ! thus the mass matrix must be replaced by three mass matrices including the "C" damping matrix
-  !
-  ! if absorbing_conditions are not set or if NCHUNKS=6, only one mass matrix is needed
-  ! for the sake of performance, only "rmassz" array will be filled and "rmassx" & "rmassy" will be obsolete
-  allocate(rmass_outer_core(NGLOB_OUTER_CORE),stat=ier)
-  if(ier /= 0) stop 'error allocating rmass in outer core'
-
-  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
-    call read_arrays_solver_adios(IREGION_OUTER_CORE,myrank, &
-              NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB_XY_dummy, &
-              nspec_iso,nspec_tiso,nspec_ani, &
-              vp_outer_core,dummy_array, &
-              xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-              xix_outer_core,xiy_outer_core,xiz_outer_core, &
-              etax_outer_core,etay_outer_core,etaz_outer_core, &
-              gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-              rhostore_outer_core,kappavstore_outer_core,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              ibool_outer_core,dummy_idoubling_outer_core,dummy_ispec_is_tiso, &
-              dummy_rmass,dummy_rmass,rmass_outer_core,rmass_ocean_load, &
-              READ_KAPPA_MU,READ_TISO, &
-              dummy_rmass,dummy_rmass)
-  else
-    call read_arrays_solver(IREGION_OUTER_CORE,myrank, &
-              NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB_XY_dummy, &
-              nspec_iso,nspec_tiso,nspec_ani, &
-              vp_outer_core,dummy_array, &
-              xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-              xix_outer_core,xiy_outer_core,xiz_outer_core, &
-              etax_outer_core,etay_outer_core,etaz_outer_core, &
-              gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-              rhostore_outer_core,kappavstore_outer_core,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              ibool_outer_core,dummy_idoubling_outer_core,dummy_ispec_is_tiso, &
-              dummy_rmass,dummy_rmass,rmass_outer_core,rmass_ocean_load, &
-              READ_KAPPA_MU,READ_TISO, &
-              dummy_rmass,dummy_rmass)
-  endif
-
-  deallocate(dummy_idoubling_outer_core,dummy_ispec_is_tiso,dummy_rmass)
-
-  ! check that the number of points in this slice is correct
-  ! check that the number of points in this slice is correct
-  if(minval(ibool_outer_core(:,:,:,:)) /= 1) &
-      call exit_MPI(myrank,'incorrect global numbering: iboolmin is not equal to 1 in outer core')
-  if(maxval(ibool_outer_core(:,:,:,:)) /= NGLOB_OUTER_CORE) then
-    call exit_MPI(myrank, 'incorrect global numbering: &
-        & iboolmax does not equal nglob in outer core')
-  endif
-
-  end subroutine read_mesh_databases_OC
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_IC()
-
-! mesh for INNER CORE region
-
-  use specfem_par
-  use specfem_par_innercore
-  implicit none
-
-  ! local parameters
-  integer :: nspec_iso,nspec_tiso,nspec_ani
-  logical :: READ_KAPPA_MU,READ_TISO
-  integer :: ier
-
-  ! dummy array that does not need to be actually read
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
-  logical, dimension(:),allocatable:: dummy_ispec_is_tiso
-
-  ! inner core (no anisotropy)
-  ! rmass_ocean_load is not used in this routine because it is meaningless in the inner core
-  READ_KAPPA_MU = .true. ! (muvstore needed for attenuation)
-  READ_TISO = .false.
-  nspec_iso = NSPEC_INNER_CORE
-  nspec_tiso = 1
-  if(ANISOTROPIC_INNER_CORE_VAL) then
-    nspec_ani = NSPEC_INNER_CORE
-  else
-    nspec_ani = 1
-  endif
-
-  allocate(dummy_ispec_is_tiso(NSPEC_INNER_CORE), &
-           stat=ier)
-  if(ier /= 0) stop 'error allocating dummy ispec in inner core'
-
-  ! allocates mass matrices in this slice (will be fully assembled in the solver)
-  !
-  ! in the case of Stacey boundary conditions, add C*deltat/2 contribution to the mass matrix
-  ! on Stacey edges for the crust_mantle and outer_core regions but not for the inner_core region
-  ! thus the mass matrix must be replaced by three mass matrices including the "C" damping matrix
-  !
-  ! if absorbing_conditions are not set or if NCHUNKS=6, only one mass matrix is needed
-  ! for the sake of performance, only "rmassz" array will be filled and "rmassx" & "rmassy" will be obsolete
-  allocate(rmassx_inner_core(NGLOB_XY_IC), &
-           rmassy_inner_core(NGLOB_XY_IC),stat=ier)
-  if(ier /= 0) stop 'error allocating rmassx, rmassy in inner_core'
-
-  allocate(rmass_inner_core(NGLOB_INNER_CORE),stat=ier)
-  if(ier /= 0) stop 'error allocating rmass in inner core'
-
-  allocate(b_rmassx_inner_core(NGLOB_XY_IC), &
-           b_rmassy_inner_core(NGLOB_XY_IC),stat=ier)
-  if(ier /= 0) stop 'error allocating b_rmassx, b_rmassy in inner_core'
-
-  ! reads in arrays
-  if( ADIOS_ENABLED .and. ADIOS_FOR_ARRAYS_SOLVER ) then
-    call read_arrays_solver_adios(IREGION_INNER_CORE,myrank, &
-              NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB_XY_IC, &
-              nspec_iso,nspec_tiso,nspec_ani, &
-              dummy_array,dummy_array, &
-              xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-              xix_inner_core,xiy_inner_core,xiz_inner_core, &
-              etax_inner_core,etay_inner_core,etaz_inner_core, &
-              gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-              rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
-              dummy_array,dummy_array,dummy_array, &
-              c11store_inner_core,c12store_inner_core,c13store_inner_core, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,c33store_inner_core, &
-              dummy_array,dummy_array,dummy_array, &
-              c44store_inner_core,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              ibool_inner_core,idoubling_inner_core,dummy_ispec_is_tiso, &
-              rmassx_inner_core,rmassy_inner_core,rmass_inner_core,rmass_ocean_load, &
-              READ_KAPPA_MU,READ_TISO, &
-              b_rmassx_inner_core,b_rmassy_inner_core)
-  else
-    call read_arrays_solver(IREGION_INNER_CORE,myrank, &
-              NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB_XY_IC, &
-              nspec_iso,nspec_tiso,nspec_ani, &
-              dummy_array,dummy_array, &
-              xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-              xix_inner_core,xiy_inner_core,xiz_inner_core, &
-              etax_inner_core,etay_inner_core,etaz_inner_core, &
-              gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-              rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
-              dummy_array,dummy_array,dummy_array, &
-              c11store_inner_core,c12store_inner_core,c13store_inner_core, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              dummy_array,dummy_array,c33store_inner_core, &
-              dummy_array,dummy_array,dummy_array, &
-              c44store_inner_core,dummy_array,dummy_array, &
-              dummy_array,dummy_array,dummy_array, &
-              ibool_inner_core,idoubling_inner_core,dummy_ispec_is_tiso, &
-              rmassx_inner_core,rmassy_inner_core,rmass_inner_core,rmass_ocean_load, &
-              READ_KAPPA_MU,READ_TISO, &
-              b_rmassx_inner_core,b_rmassy_inner_core)
-  endif
-
-  deallocate(dummy_ispec_is_tiso)
-
-  ! check that the number of points in this slice is correct
-  if(minval(ibool_inner_core(:,:,:,:)) /= 1 .or. maxval(ibool_inner_core(:,:,:,:)) /= NGLOB_INNER_CORE) &
-    call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in inner core')
-
-  end subroutine read_mesh_databases_IC
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_coupling()
-
-! to couple mantle with outer core
-
-  use specfem_par
-  use specfem_par_crustmantle
-  use specfem_par_innercore
-  use specfem_par_outercore
-
-  implicit none
-
-  ! local parameters
-  integer :: njunk1,njunk2,njunk3
-  integer :: ier
-
-  ! user output
-  if( myrank == 0 ) write(IMAIN,*) 'reading coupling surfaces...'
-
-  ! crust and mantle
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
-
-  ! Stacey put back
-  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
-        status='old',form='unformatted',action='read',iostat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error opening crust_mantle boundary.bin file')
-
-  read(27) nspec2D_xmin_crust_mantle
-  read(27) nspec2D_xmax_crust_mantle
-  read(27) nspec2D_ymin_crust_mantle
-  read(27) nspec2D_ymax_crust_mantle
-  read(27) njunk1
-  read(27) njunk2
-
-! boundary parameters
-  read(27) ibelm_xmin_crust_mantle
-  read(27) ibelm_xmax_crust_mantle
-  read(27) ibelm_ymin_crust_mantle
-  read(27) ibelm_ymax_crust_mantle
-  read(27) ibelm_bottom_crust_mantle
-  read(27) ibelm_top_crust_mantle
-
-  read(27) normal_xmin_crust_mantle
-  read(27) normal_xmax_crust_mantle
-  read(27) normal_ymin_crust_mantle
-  read(27) normal_ymax_crust_mantle
-  read(27) normal_bottom_crust_mantle
-  read(27) normal_top_crust_mantle
-
-  read(27) jacobian2D_xmin_crust_mantle
-  read(27) jacobian2D_xmax_crust_mantle
-  read(27) jacobian2D_ymin_crust_mantle
-  read(27) jacobian2D_ymax_crust_mantle
-  read(27) jacobian2D_bottom_crust_mantle
-  read(27) jacobian2D_top_crust_mantle
-  close(27)
-
-  ! read parameters to couple fluid and solid regions
-  !
-  ! outer core
-
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
-
-  ! boundary parameters
-
-  ! Stacey put back
-  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
-        status='old',form='unformatted',action='read',iostat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error opening outer_core boundary.bin file')
-
-  read(27) nspec2D_xmin_outer_core
-  read(27) nspec2D_xmax_outer_core
-  read(27) nspec2D_ymin_outer_core
-  read(27) nspec2D_ymax_outer_core
-  read(27) njunk1
-  read(27) njunk2
-
-  nspec2D_zmin_outer_core = NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
-
-  read(27) ibelm_xmin_outer_core
-  read(27) ibelm_xmax_outer_core
-  read(27) ibelm_ymin_outer_core
-  read(27) ibelm_ymax_outer_core
-  read(27) ibelm_bottom_outer_core
-  read(27) ibelm_top_outer_core
-
-  read(27) normal_xmin_outer_core
-  read(27) normal_xmax_outer_core
-  read(27) normal_ymin_outer_core
-  read(27) normal_ymax_outer_core
-  read(27) normal_bottom_outer_core
-  read(27) normal_top_outer_core
-
-  read(27) jacobian2D_xmin_outer_core
-  read(27) jacobian2D_xmax_outer_core
-  read(27) jacobian2D_ymin_outer_core
-  read(27) jacobian2D_ymax_outer_core
-  read(27) jacobian2D_bottom_outer_core
-  read(27) jacobian2D_top_outer_core
-  close(27)
-
-  !
-  ! inner core
-  !
-
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_INNER_CORE,LOCAL_PATH)
-
-  ! read info for vertical edges for central cube matching in inner core
-  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
-        status='old',form='unformatted',action='read',iostat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error opening inner_core boundary.bin file')
-
-  read(27) nspec2D_xmin_inner_core
-  read(27) nspec2D_xmax_inner_core
-  read(27) nspec2D_ymin_inner_core
-  read(27) nspec2D_ymax_inner_core
-  read(27) njunk1
-  read(27) njunk2
-
-  ! boundary parameters
-  read(27) ibelm_xmin_inner_core
-  read(27) ibelm_xmax_inner_core
-  read(27) ibelm_ymin_inner_core
-  read(27) ibelm_ymax_inner_core
-  read(27) ibelm_bottom_inner_core
-  read(27) ibelm_top_inner_core
-  close(27)
-
-  ! -- Boundary Mesh for crust and mantle ---
-  if (SAVE_BOUNDARY_MESH .and. SIMULATION_TYPE == 3) then
-
-    call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
-
-    open(unit=27,file=prname(1:len_trim(prname))//'boundary_disc.bin', &
-          status='old',form='unformatted',action='read',iostat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error opening boundary_disc.bin file')
-
-    read(27) njunk1,njunk2,njunk3
-    if (njunk1 /= NSPEC2D_MOHO .and. njunk2 /= NSPEC2D_400 .and. njunk3 /= NSPEC2D_670) &
-               call exit_mpi(myrank, 'Error reading ibelm_disc.bin file')
-    read(27) ibelm_moho_top
-    read(27) ibelm_moho_bot
-    read(27) ibelm_400_top
-    read(27) ibelm_400_bot
-    read(27) ibelm_670_top
-    read(27) ibelm_670_bot
-    read(27) normal_moho
-    read(27) normal_400
-    read(27) normal_670
-    close(27)
-
-    k_top = 1
-    k_bot = NGLLZ
-
-    ! initialization
-    moho_kl = 0.; d400_kl = 0.; d670_kl = 0.; cmb_kl = 0.; icb_kl = 0.
-  endif
-
-  end subroutine read_mesh_databases_coupling
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_addressing()
-
-  use specfem_par
-  use specfem_par_crustmantle
-  use specfem_par_innercore
-  use specfem_par_outercore
-
-  implicit none
-
-  ! local parameters
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-  integer, dimension(0:NPROCTOT_VAL-1) :: ichunk_slice,iproc_xi_slice,iproc_eta_slice
-  integer :: ier,iproc,iproc_read,iproc_xi,iproc_eta
-
-  ! open file with global slice number addressing
-  if(myrank == 0) then
-    open(unit=IIN,file=trim(OUTPUT_FILES)//'/addressing.txt',status='old',action='read',iostat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error opening addressing.txt')
-
-    do iproc = 0,NPROCTOT_VAL-1
-      read(IIN,*) iproc_read,ichunk,iproc_xi,iproc_eta
-
-      if(iproc_read /= iproc) call exit_MPI(myrank,'incorrect slice number read')
-
-      addressing(ichunk,iproc_xi,iproc_eta) = iproc
-      ichunk_slice(iproc) = ichunk
-      iproc_xi_slice(iproc) = iproc_xi
-      iproc_eta_slice(iproc) = iproc_eta
-    enddo
-    close(IIN)
-  endif
-
-  ! broadcast the information read on the master to the nodes
-  call bcast_all_i(addressing,NCHUNKS_VAL*NPROC_XI_VAL*NPROC_ETA_VAL)
-  call bcast_all_i(ichunk_slice,NPROCTOT_VAL)
-  call bcast_all_i(iproc_xi_slice,NPROCTOT_VAL)
-  call bcast_all_i(iproc_eta_slice,NPROCTOT_VAL)
-
-  ! output a topology map of slices - fix 20x by nproc
-  if (myrank == 0 ) then
-    if( NCHUNKS_VAL == 6 .and. NPROCTOT_VAL < 1000 ) then
-      write(IMAIN,*) 'Spatial distribution of the slices'
-      do iproc_xi = NPROC_XI_VAL-1, 0, -1
-        write(IMAIN,'(20x)',advance='no')
-        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
-          ichunk = CHUNK_AB
-          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
-        enddo
-        write(IMAIN,'(1x)',advance='yes')
-      enddo
-      write(IMAIN, *) ' '
-      do iproc_xi = NPROC_XI_VAL-1, 0, -1
-        write(IMAIN,'(1x)',advance='no')
-        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
-          ichunk = CHUNK_BC
-          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
-        enddo
-        write(IMAIN,'(3x)',advance='no')
-        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
-          ichunk = CHUNK_AC
-          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
-        enddo
-        write(IMAIN,'(3x)',advance='no')
-        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
-          ichunk = CHUNK_BC_ANTIPODE
-          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
-        enddo
-        write(IMAIN,'(1x)',advance='yes')
-      enddo
-      write(IMAIN, *) ' '
-      do iproc_xi = NPROC_XI_VAL-1, 0, -1
-        write(IMAIN,'(20x)',advance='no')
-        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
-          ichunk = CHUNK_AB_ANTIPODE
-          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
-        enddo
-        write(IMAIN,'(1x)',advance='yes')
-      enddo
-      write(IMAIN, *) ' '
-      do iproc_xi = NPROC_XI_VAL-1, 0, -1
-        write(IMAIN,'(20x)',advance='no')
-        do iproc_eta = NPROC_ETA_VAL -1, 0, -1
-          ichunk = CHUNK_AC_ANTIPODE
-          write(IMAIN,'(i5)',advance='no') addressing(ichunk,iproc_xi,iproc_eta)
-        enddo
-        write(IMAIN,'(1x)',advance='yes')
-      enddo
-      write(IMAIN, *) ' '
-    endif
-  endif
-
-  ! determine chunk number and local slice coordinates using addressing
-  ! (needed for Stacey conditions)
-  ichunk = ichunk_slice(myrank)
-
-  end subroutine read_mesh_databases_addressing
-
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_MPI()
-
-  use specfem_par
-  use specfem_par_crustmantle
-  use specfem_par_innercore
-  use specfem_par_outercore
-  implicit none
-
-  ! local parameters
-  real :: percentage_edge
-  integer :: ier
-
-  ! read MPI interfaces from file
-
-  ! crust mantle
-  if( ADIOS_ENABLED .and. ADIOS_FOR_MPI_ARRAYS ) then
-    call read_mesh_databases_MPI_CM_adios()
-  else
-    call read_mesh_databases_MPI_CM()
-  endif
-
-  allocate(buffer_send_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-          buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-          request_send_vector_cm(num_interfaces_crust_mantle), &
-          request_recv_vector_cm(num_interfaces_crust_mantle), &
-          stat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_crust_mantle etc.')
-
-  if( SIMULATION_TYPE == 3 ) then
-    allocate(b_buffer_send_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-            b_buffer_recv_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-            b_request_send_vector_cm(num_interfaces_crust_mantle), &
-            b_request_recv_vector_cm(num_interfaces_crust_mantle), &
-            stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_cm etc.')
-  endif
-
-  ! outer core
-  if( ADIOS_ENABLED .and. ADIOS_FOR_MPI_ARRAYS ) then
-    call read_mesh_databases_MPI_OC_adios()
-  else
-    call read_mesh_databases_MPI_OC()
-  endif
-
-  allocate(buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-          buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-          request_send_scalar_oc(num_interfaces_outer_core), &
-          request_recv_scalar_oc(num_interfaces_outer_core), &
-          stat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_scalar_outer_core etc.')
-
-  if( SIMULATION_TYPE == 3 ) then
-    allocate(b_buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-            b_buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-            b_request_send_scalar_oc(num_interfaces_outer_core), &
-            b_request_recv_scalar_oc(num_interfaces_outer_core), &
-            stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_scalar_outer_core etc.')
-  endif
-
-  ! inner core
-  if( ADIOS_ENABLED .and. ADIOS_FOR_MPI_ARRAYS ) then
-    call read_mesh_databases_MPI_IC_adios()
-  else
-    call read_mesh_databases_MPI_IC()
-  endif
-
-  allocate(buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-          buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-          request_send_vector_ic(num_interfaces_inner_core), &
-          request_recv_vector_ic(num_interfaces_inner_core), &
-          stat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_inner_core etc.')
-
-  if( SIMULATION_TYPE == 3 ) then
-    allocate(b_buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-            b_buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-            b_request_send_vector_ic(num_interfaces_inner_core), &
-            b_request_recv_vector_ic(num_interfaces_inner_core), &
-            stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_inner_core etc.')
-  endif
-
-  ! user output
-  if(myrank == 0) then
-    write(IMAIN,*) 'for overlapping of communications with calculations:'
-    write(IMAIN,*)
-
-    percentage_edge = 100. * nspec_outer_crust_mantle / real(NSPEC_CRUST_MANTLE)
-    write(IMAIN,*) 'percentage of edge elements in crust/mantle ',percentage_edge,'%'
-    write(IMAIN,*) 'percentage of volume elements in crust/mantle ',100. - percentage_edge,'%'
-    write(IMAIN,*)
-
-    percentage_edge = 100.* nspec_outer_outer_core / real(NSPEC_OUTER_CORE)
-    write(IMAIN,*) 'percentage of edge elements in outer core ',percentage_edge,'%'
-    write(IMAIN,*) 'percentage of volume elements in outer core ',100. - percentage_edge,'%'
-    write(IMAIN,*)
-
-    percentage_edge = 100. * nspec_outer_inner_core / real(NSPEC_INNER_CORE)
-    write(IMAIN,*) 'percentage of edge elements in inner core ',percentage_edge,'%'
-    write(IMAIN,*) 'percentage of volume elements in inner core ',100. - percentage_edge,'%'
-    write(IMAIN,*)
-    call flush_IMAIN()
-  endif
-  ! synchronizes MPI processes
-  call sync_all()
-
-  end subroutine read_mesh_databases_MPI
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_MPI_CM()
-
-  use specfem_par
-  use specfem_par_crustmantle
-  implicit none
-
-  ! local parameters
-  integer :: ier
-
-  ! crust mantle region
-
-  ! create the name for the database of the current slice and region
-  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
-
-  open(unit=IIN,file=prname(1:len_trim(prname))//'solver_data_mpi.bin', &
-       status='old',action='read',form='unformatted',iostat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error opening solver_data_mpi.bin')
-
-  ! MPI interfaces
-  read(IIN) num_interfaces_crust_mantle
-  allocate(my_neighbours_crust_mantle(num_interfaces_crust_mantle), &
-          nibool_interfaces_crust_mantle(num_interfaces_crust_mantle), &
-          stat=ier)
-  if( ier /= 0 ) &
-    call exit_mpi(myrank,'error allocating array my_neighbours_crust_mantle etc.')
-
-  if( num_interfaces_crust_mantle > 0 ) then
-    read(IIN) max_nibool_interfaces_cm
-    allocate(ibool_interfaces_crust_mantle(max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-            stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_crust_mantle')
-
-    read(IIN) my_neighbours_crust_mantle
-    read(IIN) nibool_interfaces_crust_mantle
-    read(IIN) ibool_interfaces_crust_mantle
-  else
-    ! dummy array
-    max_nibool_interfaces_cm = 0
-    allocate(ibool_interfaces_crust_mantle(0,0),stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating dummy array ibool_interfaces_crust_mantle')
-  endif
-
-  ! inner / outer elements
-  read(IIN) nspec_inner_crust_mantle,nspec_outer_crust_mantle
-  read(IIN) num_phase_ispec_crust_mantle
-  if( num_phase_ispec_crust_mantle < 0 ) &
-    call exit_mpi(myrank,'error num_phase_ispec_crust_mantle is < zero')
-
-  allocate(phase_ispec_inner_crust_mantle(num_phase_ispec_crust_mantle,2),&
-          stat=ier)
-  if( ier /= 0 ) &
-    call exit_mpi(myrank,'error allocating array phase_ispec_inner_crust_mantle')
-
-  if(num_phase_ispec_crust_mantle > 0 ) read(IIN) phase_ispec_inner_crust_mantle
-
-  ! mesh coloring for GPUs
-  if( USE_MESH_COLORING_GPU ) then
-    ! colors
-    read(IIN) num_colors_outer_crust_mantle,num_colors_inner_crust_mantle
-
-    allocate(num_elem_colors_crust_mantle(num_colors_outer_crust_mantle + num_colors_inner_crust_mantle), &
-            stat=ier)
-    if( ier /= 0 ) &
-      call exit_mpi(myrank,'error allocating num_elem_colors_crust_mantle array')
-
-    read(IIN) num_elem_colors_crust_mantle
-  else
-    ! allocates dummy arrays
-    num_colors_outer_crust_mantle = 0
-    num_colors_inner_crust_mantle = 0
-    allocate(num_elem_colors_crust_mantle(num_colors_outer_crust_mantle + num_colors_inner_crust_mantle), &
-            stat=ier)
-    if( ier /= 0 ) &
-      call exit_mpi(myrank,'error allocating num_elem_colors_crust_mantle array')
-  endif
-
-  close(IIN)
-
-  end subroutine read_mesh_databases_MPI_CM
-
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_MPI_OC()
-
-  use specfem_par
-  use specfem_par_outercore
-  implicit none
-
-  ! local parameters
-  integer :: ier
-
-  ! outer core region
-
-  ! create the name for the database of the current slice and region
-  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
-
-  open(unit=IIN,file=prname(1:len_trim(prname))//'solver_data_mpi.bin', &
-       status='old',action='read',form='unformatted',iostat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error opening solver_data_mpi.bin')
-
-  ! MPI interfaces
-  read(IIN) num_interfaces_outer_core
-  allocate(my_neighbours_outer_core(num_interfaces_outer_core), &
-          nibool_interfaces_outer_core(num_interfaces_outer_core), &
-          stat=ier)
-  if( ier /= 0 ) &
-    call exit_mpi(myrank,'error allocating array my_neighbours_outer_core etc.')
-
-  if( num_interfaces_outer_core > 0 ) then
-    read(IIN) max_nibool_interfaces_oc
-    allocate(ibool_interfaces_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-            stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_outer_core')
-
-    read(IIN) my_neighbours_outer_core
-    read(IIN) nibool_interfaces_outer_core
-    read(IIN) ibool_interfaces_outer_core
-  else
-    ! dummy array
-    max_nibool_interfaces_oc = 0
-    allocate(ibool_interfaces_outer_core(0,0),stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating dummy array ibool_interfaces_outer_core')
-  endif
-
-  ! inner / outer elements
-  read(IIN) nspec_inner_outer_core,nspec_outer_outer_core
-  read(IIN) num_phase_ispec_outer_core
-  if( num_phase_ispec_outer_core < 0 ) &
-    call exit_mpi(myrank,'error num_phase_ispec_outer_core is < zero')
-
-  allocate(phase_ispec_inner_outer_core(num_phase_ispec_outer_core,2),&
-          stat=ier)
-  if( ier /= 0 ) &
-    call exit_mpi(myrank,'error allocating array phase_ispec_inner_outer_core')
-
-  if(num_phase_ispec_outer_core > 0 ) read(IIN) phase_ispec_inner_outer_core
-
-  ! mesh coloring for GPUs
-  if( USE_MESH_COLORING_GPU ) then
-    ! colors
-    read(IIN) num_colors_outer_outer_core,num_colors_inner_outer_core
-
-    allocate(num_elem_colors_outer_core(num_colors_outer_outer_core + num_colors_inner_outer_core), &
-            stat=ier)
-    if( ier /= 0 ) &
-      call exit_mpi(myrank,'error allocating num_elem_colors_outer_core array')
-
-    read(IIN) num_elem_colors_outer_core
-  else
-    ! allocates dummy arrays
-    num_colors_outer_outer_core = 0
-    num_colors_inner_outer_core = 0
-    allocate(num_elem_colors_outer_core(num_colors_outer_outer_core + num_colors_inner_outer_core), &
-            stat=ier)
-    if( ier /= 0 ) &
-      call exit_mpi(myrank,'error allocating num_elem_colors_outer_core array')
-  endif
-
-  close(IIN)
-
-  end subroutine read_mesh_databases_MPI_OC
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_MPI_IC()
-
-  use specfem_par
-  use specfem_par_innercore
-  implicit none
-
-  ! local parameters
-  integer :: ier
-
-  ! inner core region
-
-  ! create the name for the database of the current slice and region
-  call create_name_database(prname,myrank,IREGION_INNER_CORE,LOCAL_PATH)
-
-  open(unit=IIN,file=prname(1:len_trim(prname))//'solver_data_mpi.bin', &
-       status='old',action='read',form='unformatted',iostat=ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error opening solver_data_mpi.bin')
-
-  ! MPI interfaces
-  read(IIN) num_interfaces_inner_core
-  allocate(my_neighbours_inner_core(num_interfaces_inner_core), &
-          nibool_interfaces_inner_core(num_interfaces_inner_core), &
-          stat=ier)
-  if( ier /= 0 ) &
-    call exit_mpi(myrank,'error allocating array my_neighbours_inner_core etc.')
-
-  if( num_interfaces_inner_core > 0 ) then
-    read(IIN) max_nibool_interfaces_ic
-    allocate(ibool_interfaces_inner_core(max_nibool_interfaces_ic,num_interfaces_inner_core), &
-            stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_inner_core')
-
-    read(IIN) my_neighbours_inner_core
-    read(IIN) nibool_interfaces_inner_core
-    read(IIN) ibool_interfaces_inner_core
-  else
-    ! dummy array
-    max_nibool_interfaces_ic = 0
-    allocate(ibool_interfaces_inner_core(0,0),stat=ier)
-    if( ier /= 0 ) call exit_mpi(myrank,'error allocating dummy array ibool_interfaces_inner_core')
-  endif
-
-  ! inner / outer elements
-  read(IIN) nspec_inner_inner_core,nspec_outer_inner_core
-  read(IIN) num_phase_ispec_inner_core
-  if( num_phase_ispec_inner_core < 0 ) &
-    call exit_mpi(myrank,'error num_phase_ispec_inner_core is < zero')
-
-  allocate(phase_ispec_inner_inner_core(num_phase_ispec_inner_core,2),&
-          stat=ier)
-  if( ier /= 0 ) &
-    call exit_mpi(myrank,'error allocating array phase_ispec_inner_inner_core')
-
-  if(num_phase_ispec_inner_core > 0 ) read(IIN) phase_ispec_inner_inner_core
-
-  ! mesh coloring for GPUs
-  if( USE_MESH_COLORING_GPU ) then
-    ! colors
-    read(IIN) num_colors_outer_inner_core,num_colors_inner_inner_core
-
-    allocate(num_elem_colors_inner_core(num_colors_outer_inner_core + num_colors_inner_inner_core), &
-            stat=ier)
-    if( ier /= 0 ) &
-      call exit_mpi(myrank,'error allocating num_elem_colors_inner_core array')
-
-    read(IIN) num_elem_colors_inner_core
-  else
-    ! allocates dummy arrays
-    num_colors_outer_inner_core = 0
-    num_colors_inner_inner_core = 0
-    allocate(num_elem_colors_inner_core(num_colors_outer_inner_core + num_colors_inner_inner_core), &
-            stat=ier)
-    if( ier /= 0 ) &
-      call exit_mpi(myrank,'error allocating num_elem_colors_inner_core array')
-  endif
-
-  close(IIN)
-
-  end subroutine read_mesh_databases_MPI_IC
-
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-  subroutine read_mesh_databases_stacey()
-
-  use specfem_par
-  use specfem_par_crustmantle
-  use specfem_par_innercore
-  use specfem_par_outercore
-
-  implicit none
-
-  ! local parameters
-  integer :: ier
-
-  ! crust and mantle
-
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
-
-  ! read arrays for Stacey conditions
-  open(unit=27,file=prname(1:len_trim(prname))//'stacey.bin', &
-        status='old',form='unformatted',action='read',iostat=ier)
-  if( ier /= 0 ) call exit_MPI(myrank,'error opening stacey.bin file for crust mantle')
-
-  read(27) nimin_crust_mantle
-  read(27) nimax_crust_mantle
-  read(27) njmin_crust_mantle
-  read(27) njmax_crust_mantle
-  read(27) nkmin_xi_crust_mantle
-  read(27) nkmin_eta_crust_mantle
-  close(27)
-
-  ! outer core
-
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
-
-  ! read arrays for Stacey conditions
-  open(unit=27,file=prname(1:len_trim(prname))//'stacey.bin', &
-        status='old',form='unformatted',action='read',iostat=ier)
-  if( ier /= 0 ) call exit_MPI(myrank,'error opening stacey.bin file for outer core')
-
-  read(27) nimin_outer_core
-  read(27) nimax_outer_core
-  read(27) njmin_outer_core
-  read(27) njmax_outer_core
-  read(27) nkmin_xi_outer_core
-  read(27) nkmin_eta_outer_core
-  close(27)
-
-  end subroutine read_mesh_databases_stacey
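
For context on the arrays allocated in read_mesh_databases_MPI above: the send/recv
buffers and request handles pair up one-to-one per MPI interface in the usual SPECFEM
non-blocking assembly pattern. Below is a minimal sketch of that pattern; the subroutine
name, loop variables and tag are illustrative assumptions, and isend_cr/irecv_cr stand
in for the MPI wrappers of src/shared/parallel.f90, not code from this commit.

  subroutine assemble_MPI_vector_send_cm_sketch()  ! hypothetical name, illustration only

  use specfem_par
  use specfem_par_crustmantle
  implicit none

  ! local parameters (assumed declarations)
  integer :: iinterface,ipoin,iglob
  integer, parameter :: itag = 0

  do iinterface = 1,num_interfaces_crust_mantle
    ! packs the boundary points of this slice for the current neighbour
    do ipoin = 1,nibool_interfaces_crust_mantle(iinterface)
      iglob = ibool_interfaces_crust_mantle(ipoin,iinterface)
      buffer_send_vector_crust_mantle(:,ipoin,iinterface) = accel_crust_mantle(:,iglob)
    enddo
    ! posts non-blocking receive and send; the request handles are waited on later,
    ! which is what allows communication to overlap with the inner-element computation
    call irecv_cr(buffer_recv_vector_crust_mantle(1,1,iinterface),NDIM*max_nibool_interfaces_cm, &
                  my_neighbours_crust_mantle(iinterface),itag,request_recv_vector_cm(iinterface))
    call isend_cr(buffer_send_vector_crust_mantle(1,1,iinterface),NDIM*max_nibool_interfaces_cm, &
                  my_neighbours_crust_mantle(iinterface),itag,request_send_vector_cm(iinterface))
  enddo

  end subroutine assemble_MPI_vector_send_cm_sketch
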

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk	2013-08-30 12:45:44 UTC (rev 22746)
@@ -236,7 +236,7 @@
 ${E}/xspecfem3D: $(XSPECFEM_OBJECTS)
 	@echo "building xspecfem3D with CUDA 5 support"
 	${NVCCLINK} -o $(cuda_DEVICE_OBJ) $(cuda_OBJECTS)
-  ${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(cuda_DEVICE_OBJ) $(MPILIBS) $(CUDA_LINK)
+	${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(cuda_DEVICE_OBJ) $(MPILIBS) $(CUDA_LINK)
 else
 
 ## cuda 4 version

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_regular_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_regular_kernels.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_regular_kernels.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -25,54 +25,13 @@
 !
 !=====================================================================
 
-  subroutine save_regular_kernels_crust_mantle(myrank, &
-                  npoints_slice, hxir_reg, hetar_reg, hgammar_reg, ispec_reg, &
-                  scale_t,scale_displ, &
-                  cijkl_kl_crust_mantle,rho_kl_crust_mantle, &
-                  alpha_kl_crust_mantle,beta_kl_crust_mantle, &
-                  ystore_crust_mantle,zstore_crust_mantle, &
-                  rhostore_crust_mantle,muvstore_crust_mantle, &
-                  kappavstore_crust_mantle,ibool_crust_mantle, &
-                  kappahstore_crust_mantle,muhstore_crust_mantle, &
-                  eta_anisostore_crust_mantle,ispec_is_tiso_crust_mantle, &
-                  LOCAL_PATH)
+  subroutine save_regular_kernels_crust_mantle()
 
-  use constants_solver
-  use specfem_par,only: ANISOTROPIC_KL,SAVE_TRANSVERSE_KL_ONLY
+  use specfem_par
+  use specfem_par_crustmantle
 
   implicit none
 
-  integer myrank
-
-  integer, intent(in) :: npoints_slice
-  real, dimension(NGLLX, NM_KL_REG_PTS_VAL), intent(in) :: hxir_reg
-  real, dimension(NGLLY, NM_KL_REG_PTS_VAL), intent(in) :: hetar_reg
-  real, dimension(NGLLZ, NM_KL_REG_PTS_VAL), intent(in) :: hgammar_reg
-  integer, dimension(NM_KL_REG_PTS_VAL), intent(in) :: ispec_reg
-
-  double precision :: scale_t,scale_displ
-
-  real(kind=CUSTOM_REAL), dimension(21,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT_ANISO_KL) :: &
-    cijkl_kl_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
-    rho_kl_crust_mantle, beta_kl_crust_mantle, alpha_kl_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: &
-        ystore_crust_mantle,zstore_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle
-
-  logical, dimension(NSPEC_CRUST_MANTLE) :: ispec_is_tiso_crust_mantle
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
-
-  character(len=150) LOCAL_PATH
-
   ! local parameters
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: &
     cijkl_kl_crust_mantle_reg
@@ -85,7 +44,7 @@
   real(kind=CUSTOM_REAL) :: rhol,mul,kappal,rho_kl,alpha_kl,beta_kl
   real(kind=CUSTOM_REAL) :: alphah_kl,alphav_kl,betah_kl,betav_kl,rhonotprime_kl
   integer :: ispec,i,j,k,iglob
-  character(len=150) prname
+!  character(len=150) prname
   double precision :: hlagrange
   integer :: ipoint
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -28,10 +28,11 @@
   subroutine setup_GLL_points()
 
   use specfem_par
+
   implicit none
 
   ! local parameters
-  integer :: i,j
+  integer :: i,j,k
 
   ! set up GLL points, weights and derivation matrices
   call define_derivation_matrices(xigll,yigll,zigll,wxgll,wygll,wzgll, &
@@ -39,9 +40,21 @@
                                  hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
                                  wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube)
 
+  ! define a 3D extension in order to be able to force vectorization in the compute_forces routines
+  if( FORCE_VECTORIZATION_VAL ) then
+    do k = 1,NGLLZ
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          wgllwgll_yz_3D(i,j,k) = wgllwgll_yz(j,k)
+          wgllwgll_xz_3D(i,j,k) = wgllwgll_xz(i,k)
+          wgllwgll_xy_3D(i,j,k) = wgllwgll_xy(i,j)
+        enddo
+      enddo
+    enddo
+  endif
+
   if( USE_DEVILLE_PRODUCTS_VAL ) then
-
-  ! check that optimized routines from Deville et al. (2002) can be used
+    ! check that optimized routines from Deville et al. (2002) can be used
     if(NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5) &
       stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ = 5'
 

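The wgllwgll_*_3D copies built above remove the dependence of the weight lookup on
separate (i,j,k) indices, so the triple GLL loop in a compute_forces routine can collapse
into a single flat loop that the compiler vectorizes more readily. A minimal sketch of
the intended access pattern, assuming element-local work arrays newtempx1..3 and
sum_terms (these names are illustrative, not taken from this commit):

    ! with FORCE_VECTORIZATION the (i,j,k) triplet collapses into one index ijk
    ! running over all NGLLX*NGLLY*NGLLZ points of the element
    do ijk = 1,NGLLX*NGLLY*NGLLZ
      sum_terms(ijk,1,1) = - ( wgllwgll_yz_3D(ijk,1,1)*newtempx1(ijk,1,1) &
                             + wgllwgll_xz_3D(ijk,1,1)*newtempx2(ijk,1,1) &
                             + wgllwgll_xy_3D(ijk,1,1)*newtempx3(ijk,1,1) )
    enddo
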
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -72,6 +72,8 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
   real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
 
+  ! arrays for force_vectorization
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
 
   !-----------------------------------------------------------------
   ! attenuation parameters
@@ -278,6 +280,21 @@
   ! ADJOINT
   real(kind=CUSTOM_REAL) :: b_deltat,b_deltatover2,b_deltatsqover2
 
+  ! this is for LDDRK
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: displ_crust_mantle_lddrk,veloc_crust_mantle_lddrk
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: displ_outer_core_lddrk,veloc_outer_core_lddrk
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: displ_inner_core_lddrk,veloc_inner_core_lddrk
+
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: A_array_rotation_lddrk,B_array_rotation_lddrk
+  real(kind=CUSTOM_REAL),dimension(:,:,:,:,:,:), allocatable :: R_memory_crust_mantle_lddrk
+  real(kind=CUSTOM_REAL),dimension(:,:,:,:,:,:), allocatable :: R_memory_inner_core_lddrk
+
+  integer :: NSTAGE_TIME_SCHEME,istage
+
+#ifdef USE_SERIAL_CASCADE_FOR_IOs
+  logical :: you_can_start_doing_IOs
+#endif
+
 end module specfem_par
 
 
@@ -411,7 +428,7 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STACEY) :: &
     rho_vp_crust_mantle,rho_vs_crust_mantle
   integer :: nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-            nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
+             nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
   integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_CM) :: nimin_crust_mantle,nimax_crust_mantle,nkmin_eta_crust_mantle
   integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_CM) :: njmin_crust_mantle,njmax_crust_mantle,nkmin_xi_crust_mantle
 
@@ -443,6 +460,14 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_670) ::  d670_kl, d670_kl_top, d670_kl_bot
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_CMB) :: cmb_kl, cmb_kl_top, cmb_kl_bot
 
+  ! For saving kernels on a regular grid
+  integer :: npoints_slice
+  integer, dimension(NM_KL_REG_PTS_VAL) :: points_slice
+  integer, dimension(NM_KL_REG_PTS_VAL) :: ispec_reg
+  real, dimension(NGLLX, NM_KL_REG_PTS_VAL) :: hxir_reg
+  real, dimension(NGLLY, NM_KL_REG_PTS_VAL) :: hetar_reg
+  real, dimension(NGLLZ, NM_KL_REG_PTS_VAL) :: hgammar_reg
+
   ! NOISE_TOMOGRAPHY
   real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: noise_sourcearray
   real(kind=CUSTOM_REAL), dimension(:), allocatable :: &
@@ -614,8 +639,8 @@
   ! Stacey
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_STACEY) :: vp_outer_core
   integer :: nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-            nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
-            nspec2D_zmin_outer_core
+             nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
+             nspec2D_zmin_outer_core
   integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_OC) :: nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core
   integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_OC) :: njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core
 
@@ -624,7 +649,7 @@
      absorb_zmin_outer_core
 
   integer :: reclen_xmin_outer_core, reclen_xmax_outer_core, &
-            reclen_ymin_outer_core, reclen_ymax_outer_core
+             reclen_ymin_outer_core, reclen_ymax_outer_core
   integer :: reclen_zmin
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_OUTER_CORE_ADJOINT) :: &

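The *_lddrk arrays and NSTAGE_TIME_SCHEME declared above provide the stage registers of a
low-dissipation low-dispersion Runge-Kutta (LDDRK) time scheme, in which each time step
loops over NSTAGE_TIME_SCHEME stages and blends the registers with per-stage coefficients.
A rough sketch of one such stage update for the crust/mantle wavefield, assuming stage
coefficient arrays ALPHA_LDDRK and BETA_LDDRK (assumed names, not defined in this commit):

    ! illustrative only: two-register RK stage update
    do istage = 1,NSTAGE_TIME_SCHEME
      ! ... assemble accel_crust_mantle for the current stage ...
      veloc_crust_mantle_lddrk = ALPHA_LDDRK(istage)*veloc_crust_mantle_lddrk + deltat*accel_crust_mantle
      displ_crust_mantle_lddrk = ALPHA_LDDRK(istage)*displ_crust_mantle_lddrk + deltat*veloc_crust_mantle
      veloc_crust_mantle = veloc_crust_mantle + BETA_LDDRK(istage)*veloc_crust_mantle_lddrk
      displ_crust_mantle = displ_crust_mantle + BETA_LDDRK(istage)*displ_crust_mantle_lddrk
    enddo
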
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90	2013-08-30 00:30:39 UTC (rev 22745)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90	2013-08-30 12:45:44 UTC (rev 22746)
@@ -66,7 +66,7 @@
 
   ! updates wavefields
   if( .not. GPU_MODE) then
-  ! on CPU
+    ! on CPU
 
     ! Newmark time scheme update
     ! mantle
@@ -78,54 +78,76 @@
     ! inner core
     call update_displ_elastic(NGLOB_INNER_CORE,displ_inner_core,veloc_inner_core,accel_inner_core, &
                                 deltat,deltatover2,deltatsqover2)
+  else
+    ! on GPU
+    ! Includes FORWARD_OR_ADJOINT == 1
+    ! outer core region
+    call it_update_displacement_oc_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+    ! inner core region
+    call it_update_displacement_ic_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+    ! crust/mantle region
+    call it_update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+  endif
 
-    ! backward field
-    if (SIMULATION_TYPE == 3) then
-      ! mantle
-      call update_displ_elastic(NGLOB_CRUST_MANTLE,b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                                b_deltat,b_deltatover2,b_deltatsqover2)
-      ! outer core
-      call update_displ_acoustic(NGLOB_OUTER_CORE,b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
-                                b_deltat,b_deltatover2,b_deltatsqover2)
-      ! inner core
-      call update_displ_elastic(NGLOB_INNER_CORE,b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-                                b_deltat,b_deltatover2,b_deltatsqover2)
-    endif
+  end subroutine update_displacement_Newmark
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+
+  subroutine update_displacement_Newmark_backward()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
+  ! checks
+  if( SIMULATION_TYPE /= 3 ) return
+
+  ! updates wavefields
+  if( .not. GPU_MODE) then
+    ! on CPU
+    ! Newmark time scheme update for backward/reconstructed fields
+    ! mantle
+    call update_displ_elastic(NGLOB_CRUST_MANTLE_ADJOINT,b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                              b_deltat,b_deltatover2,b_deltatsqover2)
+    ! outer core
+    call update_displ_acoustic(NGLOB_OUTER_CORE_ADJOINT,b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
+                              b_deltat,b_deltatover2,b_deltatsqover2)
+    ! inner core
+    call update_displ_elastic(NGLOB_INNER_CORE_ADJOINT,b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+                              b_deltat,b_deltatover2,b_deltatsqover2)
   else
     ! on GPU
-    ! Includes SIM_TYPE 1 & 3
-
+    ! Includes FORWARD_OR_ADJOINT == 3
     ! outer core region
-    call it_update_displacement_oc_cuda(Mesh_pointer, &
-                                       deltat, deltatsqover2, deltatover2, &
-                                       b_deltat, b_deltatsqover2, b_deltatover2)
+    call it_update_displacement_oc_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
     ! inner core region
-    call it_update_displacement_ic_cuda(Mesh_pointer, &
-                                       deltat, deltatsqover2, deltatover2, &
-                                       b_deltat, b_deltatsqover2, b_deltatover2)
-
+    call it_update_displacement_ic_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
     ! crust/mantle region
-    call it_update_displacement_cm_cuda(Mesh_pointer, &
-                                       deltat, deltatsqover2, deltatover2, &
-                                       b_deltat, b_deltatsqover2, b_deltatover2)
+    call it_update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
   endif
 
-  end subroutine update_displacement_Newmark
+  end subroutine update_displacement_Newmark_backward
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine update_displ_elastic(nglob,displ,veloc,accel, &
+
+  subroutine update_displ_elastic(NGLOB,displ,veloc,accel, &
                                   deltat,deltatover2,deltatsqover2)
 
   use constants_solver,only: CUSTOM_REAL,NDIM,FORCE_VECTORIZATION_VAL
 
   implicit none
 
-  integer,intent(in) :: nglob
-  real(kind=CUSTOM_REAL),dimension(NDIM,nglob),intent(inout) :: displ,veloc,accel
+  integer,intent(in) :: NGLOB
+  real(kind=CUSTOM_REAL),dimension(NDIM,NGLOB),intent(inout) :: displ,veloc,accel
   real(kind=CUSTOM_REAL),intent(in) :: deltat,deltatover2,deltatsqover2
 
   ! local parameters
@@ -133,13 +155,13 @@
 
   ! Newmark time scheme update
   if(FORCE_VECTORIZATION_VAL) then
-    do i=1,nglob * NDIM
+    do i=1,NGLOB * NDIM
       displ(i,1) = displ(i,1) + deltat * veloc(i,1) + deltatsqover2 * accel(i,1)
       veloc(i,1) = veloc(i,1) + deltatover2 * accel(i,1)
       accel(i,1) = 0._CUSTOM_REAL
     enddo
   else
-    do i=1,nglob
+    do i=1,NGLOB
       displ(:,i) = displ(:,i) + deltat * veloc(:,i) + deltatsqover2 * accel(:,i)
       veloc(:,i) = veloc(:,i) + deltatover2 * accel(:,i)
       accel(:,i) = 0._CUSTOM_REAL
@@ -153,22 +175,22 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine update_displ_acoustic(nglob,displ,veloc,accel, &
+  subroutine update_displ_acoustic(NGLOB,displ,veloc,accel, &
                                    deltat,deltatover2,deltatsqover2)
 
   use constants,only: CUSTOM_REAL
 
   implicit none
 
-  integer,intent(in) :: nglob
-  real(kind=CUSTOM_REAL),dimension(nglob),intent(inout) :: displ,veloc,accel
+  integer,intent(in) :: NGLOB
+  real(kind=CUSTOM_REAL),dimension(NGLOB),intent(inout) :: displ,veloc,accel
   real(kind=CUSTOM_REAL),intent(in) :: deltat,deltatover2,deltatsqover2
 
   ! local parameters
   integer :: i
 
   ! Newmark time scheme update
-  do i=1,nglob
+  do i=1,NGLOB
     displ(i) = displ(i) + deltat * veloc(i) + deltatsqover2 * accel(i)
     veloc(i) = veloc(i) + deltatover2 * accel(i)
     accel(i) = 0._CUSTOM_REAL
@@ -218,17 +240,18 @@
 !
 
   subroutine update_accel_elastic(NGLOB,NGLOB_XY,veloc_crust_mantle,accel_crust_mantle, &
-                                           two_omega_earth, &
-                                           rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle, &
-                                           NCHUNKS_VAL,ABSORBING_CONDITIONS)
+                                  two_omega_earth, &
+                                  rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle)
 
 ! updates acceleration in crust/mantle region
 
-  use constants_solver,only: CUSTOM_REAL,NDIM
+  use constants_solver,only: CUSTOM_REAL,NDIM,NCHUNKS_VAL
 
+  use specfem_par,only: ABSORBING_CONDITIONS
+
   implicit none
 
-  integer :: NGLOB,NGLOB_XY,NCHUNKS_VAL
+  integer :: NGLOB,NGLOB_XY
 
   ! velocity & acceleration
   ! crust/mantle region
@@ -248,8 +271,6 @@
 
   real(kind=CUSTOM_REAL) :: two_omega_earth
 
-  logical :: ABSORBING_CONDITIONS
-
   ! local parameters
   integer :: i
 
@@ -285,12 +306,12 @@
 !
 
   subroutine update_veloc_elastic(NGLOB_CM,veloc_crust_mantle,accel_crust_mantle, &
-                                            NGLOB_IC,veloc_inner_core,accel_inner_core, &
-                                            deltatover2,two_omega_earth,rmass_inner_core)
+                                  NGLOB_IC,veloc_inner_core,accel_inner_core, &
+                                  deltatover2,two_omega_earth,rmass_inner_core)
 
 ! updates velocity in crust/mantle region, and acceleration and velocity in inner core
 
-  use constants_solver,only: CUSTOM_REAL,NDIM
+  use constants_solver,only: CUSTOM_REAL,NDIM,FORCE_VECTORIZATION_VAL
 
   implicit none
 
@@ -320,19 +341,38 @@
   !         needs both, acceleration update & velocity corrector terms
 
   ! mantle
-  do i=1,NGLOB_CM
-    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
-  enddo
+  if(FORCE_VECTORIZATION_VAL) then
+    do i=1,NGLOB_CM * NDIM
+      veloc_crust_mantle(i,1) = veloc_crust_mantle(i,1) + deltatover2*accel_crust_mantle(i,1)
+    enddo
+  else
+    do i=1,NGLOB_CM
+      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
+    enddo
+  endif
 
   ! inner core
-  do i=1,NGLOB_IC
-    accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
-           + two_omega_earth*veloc_inner_core(2,i)
-    accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
-           - two_omega_earth*veloc_inner_core(1,i)
-    accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
+  if(FORCE_VECTORIZATION_VAL) then
+    do i=1,NGLOB_IC
+      accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
+             + two_omega_earth*veloc_inner_core(2,i)
+      accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
+             - two_omega_earth*veloc_inner_core(1,i)
+      accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
+    enddo
+    do i=1,NGLOB_IC * NDIM
+      veloc_inner_core(i,1) = veloc_inner_core(i,1) + deltatover2*accel_inner_core(i,1)
+    enddo
+  else
+    do i=1,NGLOB_IC
+      accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
+             + two_omega_earth*veloc_inner_core(2,i)
+      accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
+             - two_omega_earth*veloc_inner_core(1,i)
+      accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
 
-    veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
-  enddo
+      veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
+    enddo
+  endif
 
   end subroutine update_veloc_elastic


